UniProt (Universal Protein Resource) is a comprehensive, high-quality database of protein sequence and functional information. It provides detailed annotations on protein functions, structures, interactions, and taxonomy, integrating data from multiple sources. UniProt is widely used in bioinformatics, molecular biology, and biomedical research for studying proteins across different organisms. The database is accessible via a SPARQL endpoint, allowing structured queries on protein-related data.

In [12]:
from SPARQLWrapper import SPARQLWrapper, JSON  # Re-import SPARQLWrapper
import pandas as pd
import httpx
import os
import certifi
from qdrant_client import QdrantClient, models
from langchain_qdrant import QdrantVectorStore, RetrievalMode, FastEmbedSparse
from langchain_community.embeddings import FastEmbedEmbeddings
import time 
from fastembed import SparseTextEmbedding, SparseEmbedding
from typing import List
import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


# set parameters
collection_name = "indexed_classes"


os.environ['SSL_CERT_FILE'] = certifi.where()

endpoint_url = "https://sparql.uniprot.org/sparql"
sparql = SPARQLWrapper(endpoint_url)  # Initialize SPARQLWrapper
sparql.setReturnFormat(JSON)  # Set return format to JSON


In [13]:
SparseTextEmbedding.list_supported_models()

[{'model': 'prithivida/Splade_PP_en_v1',
  'sources': {'hf': 'Qdrant/SPLADE_PP_en_v1', 'url': None},
  'model_file': 'model.onnx',
  'description': 'Independent Implementation of SPLADE++ Model for English.',
  'license': 'apache-2.0',
  'size_in_GB': 0.532,
  'additional_files': [],
  'requires_idf': None,
  'vocab_size': 30522},
 {'model': 'prithvida/Splade_PP_en_v1',
  'sources': {'hf': 'Qdrant/SPLADE_PP_en_v1', 'url': None},
  'model_file': 'model.onnx',
  'description': 'Independent Implementation of SPLADE++ Model for English.',
  'license': 'apache-2.0',
  'size_in_GB': 0.532,
  'additional_files': [],
  'requires_idf': None,
  'vocab_size': 30522},
 {'model': 'Qdrant/bm42-all-minilm-l6-v2-attentions',
  'sources': {'hf': 'Qdrant/all_miniLM_L6_v2_with_attentions', 'url': None},
  'model_file': 'model.onnx',
  'description': 'Light sparse embedding model, which assigns an importance score to each token in the text',
  'license': 'apache-2.0',
  'size_in_GB': 0.09,
  'additional_f

In [14]:
uniprot_query = """SELECT DISTINCT ?class ?label ?comment
WHERE {
  ?class a rdfs:Class .
  OPTIONAL { ?class rdfs:label ?label }
  OPTIONAL { ?class rdfs:comment ?comment }
}
ORDER BY ?class
"""

In [15]:
sparql.setQuery(uniprot_query)  # Set the query
results = sparql.query().convert()  # Execute the query and convert results


# Create DataFrame from results
classes_df = pd.DataFrame(results["results"]["bindings"])
print(f"Found {len(classes_df)} classes")
classes_df.head()

Found 206561 classes


Unnamed: 0,comment,label,class
0,"{'type': 'literal', 'value': 'Published in: Fr...","{'type': 'literal', 'value': 'pentanamide + H2...","{'type': 'uri', 'value': 'http://rdf.rhea-db.o..."
1,,,"{'type': 'uri', 'value': 'http://rdf.rhea-db.o..."
2,,,"{'type': 'uri', 'value': 'http://rdf.rhea-db.o..."
3,"{'type': 'literal', 'value': 'Published in: Fr...","{'type': 'literal', 'value': 'pentanamide + H2...","{'type': 'uri', 'value': 'http://rdf.rhea-db.o..."
4,"{'type': 'literal', 'value': 'Published in: Fr...","{'type': 'literal', 'value': 'pentanoate + NH4...","{'type': 'uri', 'value': 'http://rdf.rhea-db.o..."


In [28]:
print(results["results"]["bindings"][0])

#print(results["results"]["bindings"][0]["comment"]["value"])
# TODO: - filter out labels that are Nan


{'comment': {'type': 'literal', 'value': 'Published in:\nFriedich, C.G. and Mitrenga, G. \nUtilization of aliphatic amides and formation of two different amidases by <ital>Alcaligenes eutrophus</ital>.\n<ital>J. Gen. Microbiol.</ital> 125 (1981) 367–374.'}, 'label': {'type': 'literal', 'value': 'pentanamide + H2O = pentanoate + NH4(+)'}, 'class': {'type': 'uri', 'value': 'http://rdf.rhea-db.org/10000'}}


In [31]:
from langchain.schema import Document
from qdrant_client.http import models
import pandas as pd
import certifi
import os
import uuid
from typing import List, Dict, Any

def process_results(results: Dict[str, Any], filter_empty: bool = True) -> List[Document]:
    """
    Process SPARQL results into document format.
    
    Args:
        results: SPARQL query results
        filter_empty: Whether to filter out entries with both empty labels and comments
        
    Returns:
        List of processed documents
    """
    logger.info("Processing SPARQL results")
    
    # Create DataFrame for easier processing
    df = pd.DataFrame(results["results"]["bindings"])
    
    if filter_empty:
        total_count = len(df)
        
        has_label = df['label'].notna()
        has_comment = df['comment'].notna()
        keep_mask = has_label | has_comment
        
        df = df[keep_mask]
        
        filtered_count = len(df)
        logger.info(f"Filtered {total_count - filtered_count} entries with both empty labels and comments. Kept {filtered_count} entries.")
    
    documents = []
    
    for _, item in df.iterrows():
        
        # Safely extract values, handling NaN values
        uri = ''
        if pd.notna(item.get('class')):
            uri = item['class'].get('value', '')
            
        label = ''
        if pd.notna(item.get('label')):
            label = item['label'].get('value', '')
            
        comment = ''
        if pd.notna(item.get('comment')):
            comment = item['comment'].get('value', '')
        
        # Create a combined text representation
        parts = []
        if label:
            parts.append(f"Label: {label}")
        if comment:
            parts.append(f"Description: {comment}")
            
        # Always include the URI
        uri_name = uri.split('/')[-1] if '/' in uri else uri
        parts.append(f"URI: {uri_name}")
        
        content = " ".join(parts)
        
        doc = Document(
            page_content=content,
            metadata={
                "uri": uri,
                "original_label": label,
                "original_comment": comment,
                "type": "class"
            }
        )
        documents.append(doc)
    
    logger.info(f"Converted {len(documents)} classes to documents")
    return documents

In [32]:
documents = process_results(results = results, filter_empty=True)

2025-03-02 11:35:51,813 - __main__ - INFO - Processing SPARQL results
2025-03-02 11:35:51,992 - __main__ - INFO - Filtered 135786 entries with both empty labels and comments. Kept 70775 entries.
2025-03-02 11:35:53,735 - __main__ - INFO - Converted 70775 classes to documents


In [10]:
#client = QdrantClient(url="http://localhost:6333")

client = QdrantClient(host="localhost", grpc_port=6334, prefer_grpc=True)

client.set_model("sentence-transformers/all-MiniLM-L6-v2")
client.set_sparse_model("prithivida/Splade_PP_en_v1")


In [None]:
# if client.collection_exists(collection_name):
#    print("Collection already exists")
#    client.delete_collection(collection_name)

client.create_collection(
    collection_name=collection_name,
    vectors_config=client.get_fastembed_vector_params(),
    # comment this line to use dense vectors only
    sparse_vectors_config=client.get_fastembed_sparse_vector_params(),  
)

collection_info = client.get_collection(collection_name=collection_name)
print("Available vectors:", collection_info.config.params.vectors.keys())

Collection already exists
Available vectors: dict_keys(['fast-all-minilm-l6-v2'])


In [12]:

vectordb = QdrantVectorStore(
        client=client,
        collection_name=collection_name,
        embedding=FastEmbedEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
        sparse_embedding=FastEmbedSparse(model_name="prithivida/Splade_PP_en_v1"),
        vector_name="fast-all-minilm-l6-v2",
        sparse_vector_name="fast-sparse-splade_pp_en_v1",
        retrieval_mode=RetrievalMode.HYBRID,

)

In [14]:

start_time = time.time()  # Record start time

vectordb.add_documents(documents, batch_size=100)

end_time = time.time()  # Record end time

# Calculate duration
duration = end_time - start_time
print(f"Added {len(documents)} documents to Qdrant in {duration:.2f} seconds")

Added 206561 documents to Qdrant in 20516.31 seconds
