In [4]:
import re
import chromadb
from typing import Dict, Optional


In [5]:
chroma_client = chromadb.PersistentClient('embedding')
collection = chroma_client.get_collection(name='book_by_page')

In [6]:
user_query = 'Explain chapter 3'

In [30]:
def parse_query_filters(query: str) -> Optional[Dict]:
    """Extract metadata filters with Chroma-compatible syntax"""
    filters = []
    
    # Chapter number (implicit $eq)
    chapter_match = re.search(r'(chapter|ch)\s*(\d+)', query, re.IGNORECASE)
    if chapter_match:
        filters.append({"chapter_number": chapter_match.group(2)})  # No $eq
    
    # Page number (implicit $eq)
    page_match = re.search(r'(page|pg)\s*(\d+)', query, re.IGNORECASE)
    if page_match:
        filters.append({"page_number": page_match.group(2)})  # No $eq
    
    # Image path ($ne operator)
    image_match = re.search(r'\b(image|figure|diagram|illustration)\b', query, re.IGNORECASE)
    if image_match:
        filters.append({"image_path": {"$ne": None}})
    
    # Structure for Chroma
    if not filters:
        return None
    elif len(filters) == 1:
        return filters[0]
    else:
        return {"$and": filters}  # Chroma accepts this format

In [31]:
query_filters = parse_query_filters("In chapter 5's page 82 figure, what's shown?")
query_filters

{'$and': [{'chapter_number': '5'},
  {'page_number': '82'},
  {'image_path': {'$ne': None}}]}

In [32]:
query_kwargs = {
    "query_texts": ["In chapter 5's page 82 figure, what's shown?"],
    "n_results": 5,
    "include": ["metadatas", "documents"]
}

# Add where clause only if filters exist
if query_filters:
    query_kwargs["where"] = query_filters
    
query_kwargs

{'query_texts': ["In chapter 5's page 82 figure, what's shown?"],
 'n_results': 5,
 'include': ['metadatas', 'documents'],
 'where': {'$and': [{'chapter_number': '5'},
   {'page_number': '82'},
   {'image_path': {'$ne': None}}]}}

In [39]:
results = collection.query(
    query_texts=["Your question"],
    where={
        "$and": [
            {"chapter_number": {"$eq": "5"}},
            {"page_number": {"$eq": "82"}},
            {"image_path": {"$ne": None}}
        ]
    },
    n_results=5,
    include=["metadatas", "documents"]
)

ValueError: Expected where operand value to be a str, int, float, or list of those type, got None in query.

In [8]:
def format_context_entry(chunk: str, metadata: Dict) -> str:
    """Format individual context entry with metadata"""
    context_str = f"Chapter {metadata['chapter_number']}: '{metadata['chapter_name']}' "
    context_str += f"(Page {metadata['page_number']}):\n{chunk}\n"
    if metadata.get("image_path"):
        context_str += f"[Relevant image available: {metadata['image_path']}]\n"
    return context_str

In [9]:
def build_system_message(query: str, filters: Dict) -> str:
    """Create dynamic system message based on query context"""
    base_message = (
        "You are a book expert answering questions using precise text references. "
        "Always cite sources using chapter name and page number. "
    )
    
    special_instructions = []
    
    if filters:
        if "chapter_number" in filters:
            special_instructions.append(
                f"Focus on Chapter {filters.get('chapter_number')} "
                f"({'specifically requested' if 'chapter_number' in filters else 'mentioned'})"
            )
        if "page_number" in filters:
            special_instructions.append(
                f"Pay special attention to Page {filters.get('page_number')}"
            )
        if "image_path" in filters:
            special_instructions.append(
                "Include references to figures/diagrams where available"
            )
    
    if re.search(r'\b(image|figure|diagram)\b', query, re.IGNORECASE):
        special_instructions.append(
            "When mentioning images, describe their content based on surrounding text "
            "and note their availability"
        )
    
    return base_message + " ".join(special_instructions)

In [11]:
def generate_llm_prompt(
    user_query: str,
    collection,
    num_results: int = 5
) -> str:
    """
    Enhanced version with specific chapter/page/image handling
    """
    # Parse query for filters
    query_filters = parse_query_filters(user_query)
    
    # Query Chroma DB with metadata filters
    results = collection.query(
        query_texts=[user_query],
        n_results=num_results,
        where=query_filters or None,
        include=["metadatas", "documents"]
    )
    
    # Extract and format context
    chunks = results["documents"][0]
    metadatas = results["metadatas"][0]
    
    context_parts = []
    for chunk, metadata in zip(chunks, metadatas):
        context_str = format_context_entry(chunk, metadata)
        context_parts.append(context_str)
    
    # Create enhanced system message
    system_message = build_system_message(user_query, query_filters)
    
    context = '\n\n'.join(context_parts)
    
    # Construct final prompt
    prompt = f"""SYSTEM: {system_message}

CONTEXT:
{context}

QUESTION: {user_query}

ANSWER:"""
    
    return prompt

In [12]:
prompt = generate_llm_prompt(
    "In chapter 5's page 82 figure, what's shown?",
    collection
)

ValueError: Expected where operand value to be a str, int, float, or list of those type, got None in query.

In [15]:
chapter_number = None
page_number = None

In [16]:
chapter_match = re.search(r'chapter\s+(\d+)', user_query, re.IGNORECASE)
if chapter_match:
    chapter_number = int(chapter_match.group(1))

In [17]:
chapter_number

3

In [18]:
page_match = re.search(r'page\s+(\d+)', user_query, re.IGNORECASE)
if page_match:
    page_number = int(page_match.group(1))

In [19]:
page_match

In [20]:
filters = {}
if chapter_number is not None:
    filters["chapter_number"] = {"$eq": chapter_number}
if page_number is not None:
    filters["page_number"] = {"$eq": page_number}

In [21]:
filters

{'chapter_number': {'$eq': 3}}

In [22]:
results = collection.query(
    query_texts=[user_query],
    n_results=5,  # Adjust the number of results as needed
    where=filters if filters else None
)

/home/siddhant/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [10:08<00:00, 137kiB/s]   


In [23]:
results

{'ids': [['81e858b1-b792-4bd6-8818-bfb6b6d97038',
   '14097daa-9759-4dda-82bf-b13dbbf0aa5c',
   '7df4e07c-7362-423d-8c95-cf195141e986',
   '68b3f8ab-4f69-4476-9c46-a417dc77a7d2',
   'da0a19ab-7c2e-4f19-ac55-b2d4a518f52d']],
 'embeddings': None,
 'documents': [['CHAPTER 3: WAYS OF KNOWING  /',
   'CHAPTER 3: WAYS OF KNOWING  /',
   'CHAPTER 3: WAYS OF KNOWING  /',
   'CHAPTER 3: WAYS OF KNOWING  /',
   'PART 1: LOOKING BACK  /']],
 'uris': None,
 'data': None,
 'metadatas': [[{'chapter_name': 'ways of knowing ',
    'chapter_number': 3,
    'page': '41'},
   {'chapter_name': 'ways of knowing ', 'chapter_number': 3, 'page': '39'},
   {'chapter_name': 'ways of knowing ', 'chapter_number': 3, 'page': '37'},
   {'chapter_name': 'ways of knowing ', 'chapter_number': 3, 'page': '43'},
   {'chapter_name': 'ways of knowing ', 'chapter_number': 3, 'page': '49'}]],
 'distances': [[0.8369075655937195,
   0.8369075655937195,
   0.8369075655937195,
   0.8369075655937195,
   1.105747103691101]],
 'in

In [34]:
import re
from langchain.embeddings import HuggingFaceEmbeddings

In [35]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [36]:
def search_vector_database(query, vector_store, embedding_model):
    """
    Search the vector database for entries relevant to the user query.

    Args:
        query (str): The user's search query.
        vector_store: The vector database instance.
        embedding_model: The model used to generate embeddings.

    Returns:
        list: A list of relevant entries sorted by relevance.
    """
    import re

    # Initialize filters
    filters = {}

    # Extract page number if mentioned in the query
    page_match = re.search(r'\bpage\s+(\d+)\b', query, re.IGNORECASE)
    if page_match:
        filters['page'] = int(page_match.group(1))

    # Extract chapter number if mentioned in the query
    chapter_match = re.search(r'\bchapter\s+(\d+)\b', query, re.IGNORECASE)
    if chapter_match:
        filters['chapter_number'] = int(chapter_match.group(1))

    # Generate embedding for the query
    query_embedding = embedding_model.encode(query)

    # Perform the search with filters
    results = vector_store.query(query_embedding, filters=filters, top_k=10)

    return results

In [37]:
query = "In chapter 5's page 82 figure, what's shown?"
res = search_vector_database(query=query, vector_store=collection, embedding_model=embeddings)

AttributeError: 'HuggingFaceEmbeddings' object has no attribute 'encode'