In [20]:
# Install the notebook's third-party dependencies into the kernel's environment.
# NOTE(review): versions are unpinned, so a fresh run may resolve different
# releases than the ones shown in the captured log below (e.g.
# sentence-transformers 3.3.1) — consider pinning for reproducibility.
%pip install psycopg2 numpy sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
Collecting tqdm (from sentence-transformers)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.27.0-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading safetensors-0.4.5-cp3

In [21]:
from typing import List, Optional, Tuple

import numpy as np
import psycopg2
from psycopg2.extras import Json, execute_values
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [22]:

class PostgresVectorStore:
    """
    Embed text with a sentence-transformers model and store/search it in PostgreSQL.

    Rows are written to and read from a ``document_embeddings`` table with
    ``content``, ``embedding`` and ``metadata`` columns. This class never
    creates that table, so it must already exist. Searching relies on the
    ``<=>`` operator and a ``::vector`` cast (presumably provided by the
    pgvector extension — verify against the database setup).

    A new database connection is opened for every call and closed again before
    the call returns.
    """

    def __init__(self, connection_string: str, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize the vector store with a PostgreSQL connection string.

        No database connection is opened here; only the embedding model is loaded.

        Args:
            connection_string: PostgreSQL connection string
            model_name: Name of the sentence-transformers model to use
        """
        self.conn_string = connection_string
        self.model = SentenceTransformer(model_name)

    def add_texts(self, texts: List[str], metadatas: Optional[List[dict]] = None):
        """
        Embed texts and insert them into the ``document_embeddings`` table.

        Args:
            texts: List of text strings to embed and store
            metadatas: Optional list of metadata dictionaries, one per text.
                When omitted, every text is stored with empty metadata.

        Raises:
            ValueError: If ``metadatas`` is given but its length differs from
                ``texts`` (zipping them would otherwise silently drop entries).
        """
        if metadatas is None:
            metadatas = [{} for _ in texts]
        elif len(metadatas) != len(texts):
            raise ValueError(
                f"texts and metadatas must have the same length "
                f"(got {len(texts)} texts and {len(metadatas)} metadatas)"
            )

        # Nothing to store: skip the model call and the database round trip.
        if len(texts) == 0:
            return

        # Generate embeddings for all texts in one batch
        embeddings = self.model.encode(texts)

        # One row per text: embedding as a plain list, metadata wrapped as JSON
        rows = [
            (text, embedding.tolist(), Json(metadata))
            for text, embedding, metadata in zip(texts, embeddings, metadatas)
        ]

        conn = psycopg2.connect(self.conn_string)
        try:
            # `with conn` commits on success / rolls back on error, but does
            # NOT close the connection, hence the explicit close() below.
            with conn:
                with conn.cursor() as cur:
                    execute_values(cur, """
                    INSERT INTO document_embeddings (content, embedding, metadata)
                    VALUES %s
                """, rows)
        finally:
            conn.close()

    def similarity_search(
        self,
        query: str,
        limit: int = 5
    ) -> List[Tuple[str, float, dict]]:
        """
        Perform similarity search using cosine similarity.

        The similarity score is computed as ``1 - (embedding <=> query)`` and
        rows are returned in ascending order of ``embedding <=> query``, i.e.
        most similar first.

        Args:
            query: Text query to search for
            limit: Maximum number of results to return

        Returns:
            List of tuples containing (content, similarity_score, metadata)
        """
        # Generate embedding for the query
        query_embedding = self.model.encode(query).tolist()

        conn = psycopg2.connect(self.conn_string)
        try:
            # The connection context manager only ends the transaction; the
            # connection itself is closed in the finally clause.
            with conn:
                with conn.cursor() as cur:
                    cur.execute("""
                    SELECT 
                        content,
                        1 - (embedding <=> %s::vector) as similarity,
                        metadata
                    FROM document_embeddings
                    ORDER BY embedding <=> %s::vector
                    LIMIT %s
                """, (query_embedding, query_embedding, limit))

                    return cur.fetchall()
        finally:
            conn.close()

In [23]:
# Example usage
if __name__ == "__main__":
    import os

    # Initialize vector store with Docker PostgreSQL connection.
    # Set the PGVECTOR_DSN environment variable to target another database or
    # to keep real credentials out of the notebook; the fallback below is the
    # local development default.
    store = PostgresVectorStore(
        os.environ.get(
            "PGVECTOR_DSN",
            "postgresql://vectordb:vectorpass@localhost:5432/vectordb",
        )
    )

    # Example texts and metadata (one metadata dict per text, same order)
    texts = [
        "The quick brown fox jumps over the lazy dog",
        "Machine learning is a subset of artificial intelligence",
        "Python is a versatile programming language",
        "Natural language processing helps computers understand human language",
        "Vector databases are optimized for similarity search"
    ]

    metadatas = [
        {"source": "sample1", "category": "pangram"},
        {"source": "sample2", "category": "technology"},
        {"source": "sample3", "category": "programming"},
        {"source": "sample4", "category": "nlp"},
        {"source": "sample5", "category": "databases"}
    ]

    # Add documents.
    # NOTE: add_texts only INSERTs, so re-running this cell stores the same
    # five rows again and later searches will return duplicates.
    store.add_texts(texts, metadatas)

    # Perform similarity search
    query = "Tell me about AI and machine learning"
    results = store.similarity_search(query, limit=2)

    # Print results, most similar first
    for content, similarity, metadata in results:
        print(f"Content: {content}")
        print(f"Similarity: {similarity:.4f}")
        print(f"Metadata: {metadata}")
        print("---")

Content: Machine learning is a subset of artificial intelligence
Similarity: 0.7109
Metadata: {'source': 'sample2', 'category': 'technology'}
---
Content: Natural language processing helps computers understand human language
Similarity: 0.3885
Metadata: {'source': 'sample4', 'category': 'nlp'}
---
