# Generating embeddings for SIB SPARQL queries

Using `fastembed-gpu` library, FlagEmbedding large 1.5 embedding model, and Qdrant vectorstore

Example code for loading the large Babel concepts synonyms dataset to Qdrant: https://github.com/vemonet/concept-resolver/blob/main/src/babel_load.py

Questions:
* Which info can I get from the UniProt endpoint?
* Give me an example to access cross references from the UniProt SPARQL endpoint to all the databases available in the endpoint

In [7]:
import re

from bs4 import BeautifulSoup
from SPARQLWrapper import JSON, SPARQLWrapper

# system_prompt = """You are Expasy, an assistant that helps users to query the databases from the Swiss Institute of Bioinformatics, such as UniProt or Bgee.
# When writing the SPARQL query try to factorize the predicates/objects of a subject as much as possible, so that the user can understand the query and the results.
# """
# examples_prompt: str = "Here are a list of questions and queries that Expasy has learned to answer, use them as base when answering the question from the user:"

endpoints = {
    "UniProt": "https://sparql.uniprot.org/sparql/",
    "Bgee": "https://www.bgee.org/sparql/",
}

get_queries = """PREFIX sh: <http://www.w3.org/ns/shacl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT DISTINCT ?comment ?query
WHERE
{
    ?sq a sh:SPARQLExecutable ;
        rdfs:label|rdfs:comment ?comment ;
        sh:select|sh:ask|sh:construct|sh:describe ?query .
}"""

get_prefixes = """PREFIX sh: <http://www.w3.org/ns/shacl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?prefix ?namespace
WHERE {
    [] sh:namespace ?namespace ;
        sh:prefix ?prefix .
} ORDER BY ?prefix"""

def remove_a_tags(html_text: str) -> str:
    """Remove all <a> tags from the queries descriptions"""
    soup = BeautifulSoup(html_text, "html.parser")
    for a_tag in soup.find_all("a"):
        a_tag.replace_with(a_tag.text)
    return soup.get_text()

queries = []
for endpoint_name, endpoint_url in endpoints.items():
    sparql_endpoint = SPARQLWrapper(endpoint_url)
    sparql_endpoint.setReturnFormat(JSON)

    sparql_endpoint.setQuery(get_prefixes)
    results = sparql_endpoint.query().convert()
    prefix_map = {}
    for row in results["results"]["bindings"]:
        prefix_map[row["prefix"]["value"]] = row["namespace"]["value"]

    sparql_endpoint.setQuery(get_queries)
    results = sparql_endpoint.query().convert()
    print(f"Found {len(results['results']['bindings'])} queries for {endpoint_url}")

    for row in results["results"]["bindings"]:
        query = row["query"]["value"]
        # Add prefixes to queries
        for prefix, namespace in prefix_map.items():
            prefix_str = f"PREFIX {prefix}: <{namespace}>"
            if not re.search(prefix_str, query) and re.search(f"[(| |\u00a0|/]{prefix}:", query):
                query = f"{prefix_str}\n{query}"
        queries.append({
            "endpoint": endpoint_url,
            "comment": f"{endpoint_name}: {remove_a_tags(row['comment']['value'])}",
            "query": query,
        })

# print(queries)

Found 60 queries for https://sparql.uniprot.org/sparql/
Found 19 queries for https://www.bgee.org/sparql/
[{'endpoint': 'https://sparql.uniprot.org/sparql/', 'comment': 'UniProt: Was any UniProt entry integrated on the 9th of January 2013', 'query': "PREFIX up: <http://purl.uniprot.org/core/>\nASK\nWHERE\n{\n\t?protein a up:Protein .\n\t?protein up:created '2013-01-09'^^xsd:date\n}"}, {'endpoint': 'https://sparql.uniprot.org/sparql/', 'comment': "UniProt: Construct new triples of the type 'HumanProtein' from all human UniProt entries", 'query': 'PREFIX up: <http://purl.uniprot.org/core/>\nPREFIX taxon: <http://purl.uniprot.org/taxonomy/>\nCONSTRUCT\n{\n\t?protein a up:HumanProtein .\n}\nWHERE\n{\n\t?protein a up:Protein .\n\t?protein up:organism taxon:9606\n}'}, {'endpoint': 'https://sparql.uniprot.org/sparql/', 'comment': 'UniProt: Select all triples that relate to the EMBL CDS entry AA089367.1: ', 'query': 'DESCRIBE <http://purl.uniprot.org/embl-cds/AAO89367.1>\n'}, {'endpoint': 'htt

  soup = BeautifulSoup(html_text, "html.parser")


In [None]:
from fastembed import TextEmbedding
from qdrant_client import QdrantClient, models
from qdrant_client.http.models import (
    Distance,
    VectorParams,
)

# https://qdrant.github.io/fastembed/examples/Supported_Models/
# TextEmbedding.list_supported_models()

embedding_model = TextEmbedding("BAAI/bge-large-en-v1.5")
embed_dimensions = 1024

vectordb = QdrantClient(
    host="qdrant", # Running on the same docker network with compose
    prefer_grpc=True,
)

collection_name="expasy-queries"
if not vectordb.collection_exists(collection_name):
    vectordb.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=embed_dimensions, distance=Distance.COSINE),
    )

questions = [q["comment"] for q in queries]
print(len(questions))
output = embedding_model.embed(questions)
print("Done generating embeddings")

vectordb.upsert(
    collection_name=collection_name,
    points=models.Batch(
        ids=list(range(1, len(queries) + 1)),
        vectors=[embeddings.tolist() for embeddings in output],
        payloads=queries,
    ),
)

Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 14383.76it/s]


70
Done generating embeddings


UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)