In [1]:
import re
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

In [2]:
# Sentence-transformers checkpoint used to embed queries for the vector store.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Open the "book_by_page" collection stored under ./embedding, using the
# embedding model configured earlier for query vectors.
vector_store = Chroma(
    collection_name="book_by_page",
    embedding_function=embeddings,
    persist_directory="embedding",
)

In [14]:
# 3. Helper function to extract metadata filters from the query text
def extract_filters(query):
    """Extract metadata filters (page, chapter number, chapter name) from a query.

    Recognised patterns, all matched case-insensitively:
      * ``page 15``                    -> ``{'page': 15}``
      * ``Chapter 2``                  -> ``{'chapter_number': 2}``
      * ``chapter name: Introduction`` -> ``{'chapter_name': 'Introduction'}``

    Args:
        query: Free-text user query (str).

    Returns:
        dict: Only the fields that were found, inserted in the order
        page, chapter_number, chapter_name. Empty when nothing matches.
    """
    filters = {}

    # The negative lookbehind rejects keywords that are merely the tail of a
    # longer word ("homepage 3", "subchapter 4") while still accepting a
    # keyword glued to a preceding digit or punctuation mark.
    page_match = re.search(r'(?<![A-Za-z])page\s*(\d+)', query, re.IGNORECASE)
    chapter_match = re.search(r'(?<![A-Za-z])chapter\s*(\d+)', query, re.IGNORECASE)
    # The name capture runs over word characters and whitespace, so it stops at
    # the first punctuation mark.
    chapter_name_match = re.search(
        r'(?<![A-Za-z])chapter\s*name\s*:\s*([\w\s]+)', query, re.IGNORECASE
    )

    if page_match:
        # NOTE(review): the documents displayed in this notebook show `page`
        # stored as a string (e.g. '25'), whereas this yields an int - confirm
        # the stored type, otherwise an exact-match page filter may never hit.
        filters['page'] = int(page_match.group(1))
    if chapter_match:
        filters['chapter_number'] = int(chapter_match.group(1))
    if chapter_name_match:
        chapter_name = chapter_name_match.group(1).strip()
        # The capture can be whitespace only (e.g. "chapter name: !"); skip it
        # instead of emitting an empty-string filter that matches nothing.
        if chapter_name:
            filters['chapter_name'] = chapter_name

    return filters

In [15]:
# Sample query that mentions both a chapter number and a page number, so the
# filter extraction has two fields to pick up.
user_query = "Find content from Chapter 2 page 15 about background information."

In [17]:
# Pull the metadata constraints (page / chapter number / chapter name) out of
# the query text.
filters = extract_filters(user_query)

In [18]:
filters

{'page': 15, 'chapter_number': 2}

In [24]:
def build_chroma_filter(filters):
    """Turn a flat dict of metadata constraints into a filter for the search call.

    Args:
        filters: Mapping of metadata field -> required value; may be empty
            or None.

    Returns:
        None when there are no constraints, the mapping itself when it holds
        exactly one entry, otherwise ``{"$and": [{field: value}, ...]}`` with
        one single-key dict per constraint, in the mapping's iteration order.
    """
    if filters:
        if len(filters) > 1:
            # Several constraints: wrap each one separately under "$and".
            clauses = []
            for field, wanted in filters.items():
                clauses.append({field: wanted})
            return {"$and": clauses}
        # A single constraint is passed through unchanged.
        return filters
    return None

In [25]:
# Convert the extracted fields into the value passed as `filter=` to the
# similarity search.
chroma_filter = build_chroma_filter(filters)


In [26]:
chroma_filter

{'$and': [{'page': 15}, {'chapter_number': 2}]}

In [28]:
# Filtered similarity search: request up to 10 documents for the query,
# restricted by chroma_filter.
res = vector_store.similarity_search(query=user_query, k=10, filter=chroma_filter)

In [23]:
res

[Document(metadata={'chapter_name': 'a broad overview ', 'chapter_number': 2, 'image_paths': 'images/page_25_img_0.png', 'page': '25'}, page_content='CHAPTER 2: A BROAD OVERVIEW  /'),
 Document(metadata={'chapter_name': 'a broad overview ', 'chapter_number': 2, 'image_paths': 'images/page_29_img_0.png', 'page': '29'}, page_content='CHAPTER 2: A BROAD OVERVIEW  /'),
 Document(metadata={'chapter_name': 'a broad overview ', 'chapter_number': 2, 'image_paths': 'images/page_31_img_0.png', 'page': '31'}, page_content='CHAPTER 2: A BROAD OVERVIEW  /'),
 Document(metadata={'chapter_name': 'a broad overview ', 'chapter_number': 2, 'image_paths': 'images/page_33_img_0.png', 'page': '33'}, page_content='CHAPTER 2: A BROAD OVERVIEW  /'),
 Document(metadata={'chapter_name': 'a broad overview ', 'chapter_number': 2, 'image_paths': 'images/page_27_img_0.png', 'page': '27'}, page_content='CHAPTER 2: A BROAD OVERVIEW  /     \n     \n27'),
 Document(metadata={'chapter_name': 'a broad overview ', 'chapte

In [29]:
def search_vectorstore(query, k=5):
    """Run a metadata-filtered similarity search for ``query``.

    Metadata constraints are extracted from the query text, converted into a
    filter and passed to ``vector_store.similarity_search``. When constraints
    were found, each returned document's metadata is re-checked against them
    with exact equality before the result list is truncated.

    Args:
        query: Free-text user query.
        k: Maximum number of documents to return (default 5).

    Returns:
        list: At most ``k`` documents, in the order the vector store
        returned them.
    """
    # Extract metadata filters from the query
    filters = extract_filters(query)
    chroma_filter = build_chroma_filter(filters)

    # Fetch a pool of at least 10 candidates (the historical pool size) but
    # never fewer than k, so that a request for k > 10 is not silently capped.
    fetch_k = max(k, 10)
    results = vector_store.similarity_search(query, k=fetch_k, filter=chroma_filter)

    # Defensive re-check: keep only documents whose metadata equals every
    # extracted value.
    if filters:
        results = [
            doc
            for doc in results
            if all(doc.metadata.get(key) == value for key, value in filters.items())
        ]

    return results[:k]

In [44]:
# Chapter-only query: no page is mentioned, so only the chapter-number pattern
# in extract_filters applies.
user_query = "Find content from Chapter 2"


In [32]:
    
    # Search the vector store using the provided query
# k is left at its default of 5, so at most five documents are returned.
search_results = search_vectorstore(user_query)

In [33]:
search_results

[Document(metadata={'chapter_name': 'a broad overview ', 'chapter_number': 2, 'image_paths': 'images/page_25_img_0.png', 'page': '25'}, page_content='CHAPTER 2: A BROAD OVERVIEW  /'),
 Document(metadata={'chapter_name': 'a broad overview ', 'chapter_number': 2, 'image_paths': 'images/page_29_img_0.png', 'page': '29'}, page_content='CHAPTER 2: A BROAD OVERVIEW  /'),
 Document(metadata={'chapter_name': 'a broad overview ', 'chapter_number': 2, 'image_paths': 'images/page_31_img_0.png', 'page': '31'}, page_content='CHAPTER 2: A BROAD OVERVIEW  /'),
 Document(metadata={'chapter_name': 'a broad overview ', 'chapter_number': 2, 'image_paths': 'images/page_33_img_0.png', 'page': '33'}, page_content='CHAPTER 2: A BROAD OVERVIEW  /'),
 Document(metadata={'chapter_name': 'a broad overview ', 'chapter_number': 2, 'image_paths': 'images/page_34_img_0.png', 'page': '34'}, page_content='It doesn’t matter if you can’t come up with satisfactory answers to these questions. What is important is that you

In [48]:
# Header of the context string: the user query followed by a
# "Search results:" heading.
combined_context = f"User query: {user_query}\n\nSearch results:\n"

In [49]:
combined_context

'User query: Find content from Chapter 2\n\nSearch results:\n'

In [50]:
# Rebuild the context from its header on every run, so that re-executing this
# cell does not append the same search results a second time.
combined_context = f"User query: {user_query}\n\nSearch results:\n" + "".join(
    f"Text: {doc.page_content}\n\n" for doc in search_results
)

In [51]:
combined_context

'User query: Find content from Chapter 2\n\nSearch results:\nText: CHAPTER 2: A BROAD OVERVIEW  /\n\nText: CHAPTER 2: A BROAD OVERVIEW  /\n\nText: CHAPTER 2: A BROAD OVERVIEW  /\n\nText: CHAPTER 2: A BROAD OVERVIEW  /\n\nText: It doesn’t matter if you can’t come up with satisfactory answers to these questions. What is important is that you engage with the questions, and like our characters in these stories, try to answer them as best as you can for now, and refine them as you go along. This type of keen and sustained engagement will strengthen your intellectual muscles, and after a few months, you will discover that you have a stronger, faster, and more agile mind. Do make a note of your answers so that you can re-evaluate them once you’ve come to the end of the book! By the way, you may want to come up with a research plan to test the tulsi tea hypothesis; and also, to find an answer to the ‘sum-of-angles’ question that Samira raised.  One thing that you will notice about inquiry is t

In [54]:
# Collect the image path recorded on each retrieved document, skipping
# documents whose 'image_paths' metadata is missing or empty.
images = [
    doc.metadata.get('image_paths')
    for doc in search_results
    if doc.metadata.get('image_paths')
]

In [55]:
images

['images/page_25_img_0.png',
 'images/page_29_img_0.png',
 'images/page_31_img_0.png',
 'images/page_33_img_0.png',
 'images/page_34_img_0.png']