In [1]:
# Entity sources to index. Each list item maps a source key to its settings:
# a URI, a human-readable label and description, the SPARQL endpoint to query,
# a pagination flag, and a SELECT query that yields ?uri / ?label pairs.
# The list is passed to load_entities_from_endpoints() in the next cell.

# SPARQL endpoint shared by every source configured in this cell.
UNIPROT_SPARQL_ENDPOINT = "https://sparql.uniprot.org/sparql/"

# Walks Rhea reactions -> sides -> participants -> compounds and returns the
# ChEBI URI and label of each compound, i.e. only ChEBI entities that take
# part in at least one Rhea reaction.
# The text (including its indentation) is sent verbatim as the query string.
CHEBI_VIA_RHEA_QUERY = """
                        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
                        PREFIX rh:   <http://rdf.rhea-db.org/>
                        SELECT DISTINCT ?uri ?label
                        WHERE {
                            # Restrict to things that are recognized as Rhea reactions
                            ?reaction rdfs:subClassOf rh:Reaction ;
                                    rh:side        ?side .
                            # Each reaction side contains one or more participants
                            ?side rh:contains ?participant .
                            # Each participant is linked to a ChEBI entity via 'rh:compound' 
                            ?participant rh:compound ?compound .
                            # 'rh:chebi' points to the ChEBI identifier (e.g., http://purl.obolibrary.org/obo/CHEBI_15377)
                            ?compound rh:chebi ?uri .
                            ?uri rdfs:label ?label .
                        }"""

entities_list = [
    # ------------------------------------------------------------------
    # Disabled sources: uncomment an entry to include it in the index.
    # NOTE(review): the disease query selects ?type but never binds it.
    # ------------------------------------------------------------------
    # {
    #     "uniprot_disease": {
    #         "uri": "http://purl.uniprot.org/core/Disease",
    #         "label": "Disease",
    #         "description": "The preferred names of diseases.",
    #         "endpoint": "https://sparql.uniprot.org/sparql/",
    #         "pagination": True,
    #         "query": """PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    #                     PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    #                     PREFIX up: <http://purl.uniprot.org/core/>
    #                     SELECT ?uri ?label ?type WHERE {
    #                         ?uri a up:Disease ;
    #                             skos:prefLabel ?label .
    #                     }"""
    #     }
    # },
    # {
    #     "uniprot_taxon": {
    #         "uri": "http://purl.uniprot.org/core/Taxon",
    #         "label": "species",
    #         "description": "taxon scientific names",
    #         "endpoint": "https://sparql.uniprot.org/sparql/",
    #         "pagination": True,
    #         "query": """PREFIX up: <http://purl.uniprot.org/core/>
    #                     SELECT ?uri ?label
    #                     WHERE {
    #                         ?uri a up:Taxon ;
    #                             up:scientificName ?label .
    #                     }"""
    #     }
    # },
    # {
    #     "rhea_reaction": {
    #         "uri": "http://rdf.rhea-db.org/",
    #         "label": "reactions",
    #         "description": "Reactions in RHEA.",
    #         "endpoint": "https://sparql.uniprot.org/sparql/",
    #         "pagination": True,
    #         "query": """PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    #                     PREFIX rh:   <http://rdf.rhea-db.org/>
    #                     SELECT DISTINCT ?uri ?label
    #                     WHERE {
    #                         ?uri rdfs:subClassOf rh:Reaction .
    #                         ?uri rdfs:label ?label .
    #                     }"""
    #     }
    # },
    # ------------------------------------------------------------------
    # Active source.
    # ------------------------------------------------------------------
    {
        "chebi_chemical_entities": {
            "uri": "http://purl.obolibrary.org/obo/CHEBI",
            "label": "chemical entities",
            "description": "Chemical entities in ChEBI.",
            "endpoint": UNIPROT_SPARQL_ENDPOINT,
            "pagination": True,
            "query": CHEBI_VIA_RHEA_QUERY,
        }
    },
]

In [2]:
# Import the one loader this cell uses by name instead of `import *`, so the
# origin of every name in the notebook namespace stays traceable.
from endpoint_loader import load_entities_from_endpoints
import pandas as pd

# Run the configured queries (entities_list comes from the config cell) and
# keep the raw records; the indexing cell below consumes `entities` directly.
entities = load_entities_from_endpoints(entities_list)

# Tabular view used only for inspection in this cell.
df = pd.DataFrame(entities)

# `display` is expected to be supplied by the notebook kernel — TODO confirm it
# was not previously picked up through the wildcard import.
display(df.head())

# Quick sanity summary: row count, available columns, distinct entity types.
print(f"Total entities: {len(df)}")
print(f"Columns: {df.columns.tolist()}")
print(f"Entity types: {df['entity_type'].unique()}")

Flattened 1 entities for indexing
Flattened entities: [{'uri': 'http://purl.obolibrary.org/obo/CHEBI', 'label': 'chemical entities', 'description': 'Chemical entities in ChEBI.', 'endpoint': 'https://sparql.uniprot.org/sparql/', 'pagination': True, 'query': "\n                        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\n                        PREFIX rh:   <http://rdf.rhea-db.org/>\n                        SELECT DISTINCT ?uri ?label\n                        WHERE {\n                            # Restrict to things that are recognized as Rhea reactions\n                            ?reaction rdfs:subClassOf rh:Reaction ;\n                                    rh:side        ?side .\n                            # Each reaction side contains one or more participants\n                            ?side rh:contains ?participant .\n                            # Each participant is linked to a ChEBI entity via 'rh:compound' \n                            ?participant rh:compound 

Unnamed: 0,label,uri,endpoint_url,entity_type,description
0,2-O-(alpha-D-mannopyranosyl)-1-phosphatidyl-1D...,http://purl.obolibrary.org/obo/CHEBI_87673,https://sparql.uniprot.org/sparql/,literal,Chemical entities in ChEBI.
1,"(3E,5Z)-tetradecadienoyl-CoA(4-)",http://purl.obolibrary.org/obo/CHEBI_71586,https://sparql.uniprot.org/sparql/,literal,Chemical entities in ChEBI.
2,(5Z)-tetradecenoyl-CoA(4-),http://purl.obolibrary.org/obo/CHEBI_84650,https://sparql.uniprot.org/sparql/,literal,Chemical entities in ChEBI.
3,(25R)-3beta-hydroxycholest-5-en-7-one-26-al,http://purl.obolibrary.org/obo/CHEBI_87677,https://sparql.uniprot.org/sparql/,literal,Chemical entities in ChEBI.
4,(25R)-3beta-hydroxycholest-5-en-7-one-26-oate,http://purl.obolibrary.org/obo/CHEBI_87678,https://sparql.uniprot.org/sparql/,literal,Chemical entities in ChEBI.


Total entities: 12216
Columns: ['label', 'uri', 'endpoint_url', 'entity_type', 'description']
Entity types: ['literal']


In [3]:
from entity_indexing_pipeline_v3 import EmbeddingPipeline
import ray

# Number of documents handed to the pipeline per upload batch.
UPLOAD_BATCH_SIZE = 300

# Start (or reuse) the Ray runtime; ignore_reinit_error makes re-running this
# cell tolerate an already-initialised runtime.
ray.init(ignore_reinit_error=True)

# NOTE(review): the recorded log shows the existing 'biomedical_entities'
# collection being recreated during this step — confirm that is intended
# before re-running against a populated store.
try:
    pipeline = EmbeddingPipeline()
    pipeline.add_documents(entities, batch_size=UPLOAD_BATCH_SIZE)
finally:
    # Always release Ray, including when the run fails or is interrupted
    # (KeyboardInterrupt); previously the shutdown call was skipped in that
    # case and Ray was left running.
    ray.shutdown()

2025-03-28 18:13:45,483	INFO worker.py:1843 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


Creating embedding workers...
Initializing Qdrant client...
No existing upload log found, starting fresh
Collection 'biomedical_entities' exists, recreating...
Creating new collection 'biomedical_entities'
Collection initialized successfully
Adding 12216 documents in batches of 300
Chunk size: 100
Time taken to generate dense embeddings with Ray Distributed Computing: 9.510628700256348 seconds
Time taken to generate sparse embeddings with Ray Distributed Computing: 154.10256695747375 seconds
Creating points...
Upserting points...
✓ Batch 1/41 - Total uploaded: 300
Chunk size: 100
Time taken to generate dense embeddings with Ray Distributed Computing: 7.686643838882446 seconds


KeyboardInterrupt: 

In [4]:
# Manual cleanup: stop the Ray runtime started by ray.init() in the indexing
# cell. The recorded run of that cell ended with a KeyboardInterrupt.
ray.shutdown()