# Get the most relevant SIB SPARQL queries using similarity search

This notebook uses the `fastembed-gpu` library, the `BAAI/bge-large-en-v1.5` embedding model (FlagEmbedding large v1.5), and a Qdrant vector store.

In [12]:

from fastembed import TextEmbedding
from qdrant_client import QdrantClient

# Vector size recorded alongside the model below (not used in this cell).
embedding_dimensions = 1024
# Text embedding model used to vectorize the search questions.
embedding_model = TextEmbedding("BAAI/bge-large-en-v1.5")

# Name of the Qdrant collection that is searched.
collection_name = "expasy-queries"
vectordb = QdrantClient(
    # "qdrant" is the service hostname: the client runs on the same docker
    # network (docker compose) as the vector database.
    host="qdrant",
    prefer_grpc=True,
)

# Sanity check: report how many vectors the collection currently holds.
points_count = vectordb.get_collection(collection_name).points_count
print(f"VectorDB loaded with {points_count} vectors")

# Natural-language questions whose closest stored entries we want to retrieve.
search_queries = [
    "Give me an example to access cross references from the UniProt SPARQL endpoint to all the databases available in the endpoint",
    "Give me an example to access cross references from the Bgee SPARQL endpoint to all the databases available in the endpoint",
]

for search_query in search_queries:
    print(search_query)

    # A single text is embedded, so exactly one vector comes back; unpacking
    # consumes the whole iterable just like materializing it into a list.
    [query_embedding] = embedding_model.embed([search_query])

    # Retrieve the 10 nearest stored vectors for this question.
    hits = vectordb.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        limit=10,
    )

    # One line per match: similarity score (3 decimals) followed by its payload.
    for hit in hits:
        score_label = f"{hit.score:.3f}"
        print(score_label, hit.payload)

    # Blank line to separate the results of consecutive questions.
    print()

Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 24672.38it/s]


VectorDB loaded with 70 vectors
Give me an example to access cross references from the UniProt SPARQL endpoint to all the databases available in the endpoint
0.754 {'comment': 'UniProt: Select the average number of cross-references to the PDB database of UniProt entries that have at least one cross-reference to the PDB database', 'endpoint': 'https://sparql.uniprot.org/sparql/', 'query': 'PREFIX up: <http://purl.uniprot.org/core/>\nPREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\nSELECT (AVG(?linksToPdbPerEntry) AS ?avgLinksToPdbPerEntry)\nWHERE\n{\n\tSELECT ?protein (COUNT(DISTINCT ?db) AS ?linksToPdbPerEntry)\n\tWHERE\n\t{\n\t\t?protein a up:Protein .\n\t\t?protein rdfs:seeAlso ?db .\n\t\t?db up:database <http://purl.uniprot.org/database/PDB> .\n\t}\n\tGROUP BY ?protein ORDER BY DESC(?linksToPdbPerEntry)\n}'}
0.747 {'comment': 'UniProt: Select a mapping of UniProt to PDB entries using the UniProt cross-references to the PDB database', 'endpoint': 'https://sparql.uniprot.org/spar