In [None]:
# Hybrid Multimodal Search with Gemini, Qdrant, and BM25
# A comprehensive system for semantic and keyword-based document search

# ==========================================
# 1. SETUP AND AUTHENTICATION
# ==========================================

# Install required libraries
!pip install qdrant-client google-generativeai rank-bm25 PyMuPDF Pillow numpy pandas sentence-transformers

import time
from typing import List
import os
import io
import base64
import json
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass
import numpy as np
from PIL import Image

# Core libraries
import fitz  # PyMuPDF
import google.generativeai as genai
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
from rank_bm25 import BM25Okapi

# NLTK supplies the tokenizer (word_tokenize) and the English stop-word list
# that DocumentIngester.ingest_chunks uses when building the BM25 corpus.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch the tokenizer models and the stop-word corpus. The recorded cell
# output shows these calls report "already up-to-date" when the data exists.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
# Module-level set of English stop words. The same assignment is repeated
# further down in this cell inside try/except LookupError guards.
stop_words = set(stopwords.words('english'))
# Google Colab specific
from google.colab import files, userdata
import ipywidgets as widgets
from IPython.display import display, HTML, Image as IPImage

print("‚úÖ All libraries installed successfully!")

# ==========================================
# AUTHENTICATION SETUP
# ==========================================

# Read the Gemini API key from the Colab secret store and configure the SDK.
# On any failure, setup instructions are printed and the exception is
# re-raised, so execution of the cell stops at this point.
try:
    GEMINI_API_KEY = userdata.get('GEMINI_API_KEY')
    genai.configure(api_key=GEMINI_API_KEY)
    print("‚úÖ Gemini API configured successfully!")
except Exception as e:
    print("‚ùå Please add your GEMINI_API_KEY to Colab secrets")
    print("Go to the key icon on the left sidebar and add GEMINI_API_KEY")
    raise e

# Factory for the Qdrant client used by the demo.
def create_qdrant_client():
    """Build a QdrantClient from Colab secrets.

    The endpoint is read from the 'QDRANT_CLOUD_URL' secret and the API key
    from the 'QDRANT_API' secret. For a throwaway local run,
    QdrantClient(":memory:") is the in-memory alternative.

    Returns:
        The connected client, or None when reading the secrets or constructing
        the client raises (the error is printed, not propagated).
    """
    try:
        client = QdrantClient(
            url=userdata.get('QDRANT_CLOUD_URL'),
            api_key=userdata.get('QDRANT_API'),
        )
        print("‚úÖ Qdrant client initialized successfully!")
        return client
    except Exception as err:
        print(f"‚ùå Error initializing Qdrant client: {err}")
        return None


# (Re)build the English stop-word set, tolerating missing NLTK data.
# LookupError is what this guard catches when the 'stopwords' corpus is
# unavailable; in that case an empty set is used so later membership tests
# still work (no stop words are filtered).
try:
    stop_words = set(stopwords.words('english'))
    print("‚úÖ NLTK stop words loaded successfully!")
except LookupError:
    print("‚ùå NLTK data not found. Please run the cell to download 'punkt' and 'stopwords'.")
    stop_words = set() # Initialize as empty set to avoid errors later
# import json
# import os
# from qdrant_client import QdrantClient
# from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
# from typing import List, Dict, Any, Optional, Tuple
# from dataclasses import dataclass
# import numpy as np
# from rank_bm25 import BM25Okapi
# import nltk
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
# import io
# from PIL import Image
# import fitz
# from IPython.display import display, Image as IPImage

# One retrievable unit extracted from a PDF: a text chunk or an image description.
@dataclass
class DocumentChunk:
    """Record for a single piece of PDF content that is embedded and indexed."""
    id: str  # "page_{n}_chunk_{i}" or "page_{n}_image_{i}", as built in extract_pdf_content
    text: str  # text that gets embedded/tokenized; for image chunks this is the generated description
    page_number: int  # zero-based page index (comes straight from the range() loop over pages)
    source_type: str  # "document" for text chunks, "image" for image chunks
    image_description: Optional[str] = None  # populated only for image chunks
    image_data: Optional[bytes] = None  # PNG-encoded bytes of the image, for image chunks
    metadata: Optional[Dict] = None  # extract_pdf_content stores {"pdf_path": ...} here

# Result wrapper pairing a chunk with its score and rank.
@dataclass
class SearchResult:
    """A single hit returned by a search, tagged with the search mode that produced it."""
    chunk: DocumentChunk  # the matched chunk
    score: float  # relevance score; scale presumably depends on search_type — TODO confirm against the search engine
    rank: int  # position in the result list; whether it is 0- or 1-based is not visible here
    search_type: str  # 'semantic', 'keyword', or 'hybrid'

# Shared settings read by the ingestion code in this cell.
CONFIG = {
    "collection_name": "calcom_help_docs", # Qdrant collection; previous value: "hybrid_search_collection_test_bm25"
    "vector_dimension": 768,  # vector size used when creating the collection; must match the length returned by the embedding model ("models/embedding-001" below)
    "chunk_size": 1000,  # maximum characters per text chunk (passed to extract_and_chunk_text)
    "chunk_overlap": 200,  # characters shared between consecutive chunks
    "top_k_results": 5, # not read in this cell; presumably the number of hits returned by search — verify
    "rrf_k": 60  # RRF parameter; same value as reciprocal_rank_fusion's default k
}

# Same stop-word initialisation as earlier in this cell (repeated verbatim,
# minus the success message): falls back to an empty set on LookupError.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    print("‚ùå NLTK data not found. Please run the cell to download 'punkt' and 'stopwords'.")
    stop_words = set()

# Embedding helper: wraps genai.embed_content with exponential backoff on rate limits.
def get_gemini_embedding_with_retry(
    text: str,
    task_type: str = "RETRIEVAL_DOCUMENT",
    max_retries: int = 5,
    initial_delay: float = 1,
) -> Optional[List[float]]:
    """Get an embedding from the Gemini API, retrying when rate-limited.

    Args:
        text: Content to embed.
        task_type: Task type forwarded to ``genai.embed_content``.
        max_retries: Total number of attempts before giving up.
        initial_delay: Seconds to wait after the first rate-limited attempt;
            the wait doubles after every further rate-limited attempt.

    Returns:
        The embedding vector, or ``None`` when a non-rate-limit error occurs
        or every attempt was rate-limited. Callers must check for ``None``.
    """
    # Substrings (lower-case) that identify a rate-limit / quota error message.
    rate_limit_markers = ("quota", "too many requests", "429", "rate limit")
    delay = initial_delay

    for attempt in range(1, max_retries + 1):
        try:
            result = genai.embed_content(
                model="models/embedding-001",
                content=text,
                task_type=task_type
            )
            return result['embedding']
        except Exception as e:
            # Match on the lower-cased message so the check is case-insensitive.
            error_message = str(e).lower()

            if not any(marker in error_message for marker in rate_limit_markers):
                # Anything other than rate limiting is not worth retrying.
                print(f"Error getting embedding: {e}")
                return None

            if attempt == max_retries:
                # Out of attempts: do not sleep for a retry that will never happen.
                break

            print(f"Rate limited. Retrying in {delay} seconds... (Attempt {attempt}/{max_retries})")
            time.sleep(delay)
            delay *= 2  # Exponential backoff

    # If all retries fail
    print(f"Failed to get embedding after {max_retries} attempts.")
    return None

# Reciprocal Rank Fusion (RRF) helper for merging two ranked result lists.
def reciprocal_rank_fusion(semantic_results: List[Tuple], keyword_results: List[Tuple], k: int = 60) -> List[Tuple]:
    """Combine semantic and keyword search results using Reciprocal Rank Fusion.

    Each ranking contributes ``1 / (k + rank + 1)`` per document, where
    ``rank`` is the zero-based position in that ranking; contributions from
    both rankings are summed.

    Args:
        semantic_results: Ranked tuples whose first element is the document id.
        keyword_results: Ranked tuples whose first element is the document id.
        k: RRF smoothing constant.

    Returns:
        ``(doc_id, fused_score)`` tuples sorted by descending fused score.
        Ties keep first-seen order (semantic ranking first, then keyword), so
        the output is deterministic. If an id appears more than once in one
        ranking, only its first (best) position counts.
    """
    fused_scores: Dict[Any, float] = {}

    for ranking in (semantic_results, keyword_results):
        seen_in_ranking = set()
        for rank, item in enumerate(ranking):
            doc_id = item[0]
            if doc_id in seen_in_ranking:
                # A repeated id must not replace its better, earlier rank.
                continue
            seen_in_ranking.add(doc_id)
            fused_scores[doc_id] = fused_scores.get(doc_id, 0) + 1.0 / (k + rank + 1)

    # sorted() is stable, so equal scores stay in insertion order.
    return sorted(fused_scores.items(), key=lambda pair: pair[1], reverse=True)

# Text chunking helper used by the PDF ingestion code.
def extract_and_chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
    """Split text into overlapping chunks, preferring sentence/line boundaries.

    Args:
        text: Text to split.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters a chunk shares with its predecessor.

    Returns:
        ``[text]`` unchanged when it already fits in one chunk; otherwise the
        stripped, non-empty chunks in document order. The last chunk ends at
        the end of the text, so no overlap-only trailing chunk is produced.

    Raises:
        ValueError: If the text needs splitting and ``chunk_size`` is not
            positive or ``overlap`` is not smaller than ``chunk_size``
            (either would prevent forward progress).
    """
    if len(text) <= chunk_size:
        return [text]

    # Validated only once splitting is actually required, so short inputs
    # keep returning [text] regardless of the other arguments.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    chunks = []
    text_length = len(text)
    start = 0
    while start < text_length:
        end = min(start + chunk_size, text_length)

        # Try to break at a sentence or line boundary, but only in the second
        # half of the window so chunks do not become too small.
        if end < text_length:
            window = text[start:end]
            # rfind() positions are relative to the window, not to the text.
            break_offset = max(window.rfind('.'), window.rfind('\n'))
            if break_offset > chunk_size // 2:
                end = start + break_offset + 1

        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)

        if end >= text_length:
            break

        # Step back by the overlap, but always move forward by at least one
        # character even when a boundary break shortened the chunk.
        start = max(end - overlap, start + 1)

    return chunks


class DocumentIngester:
    """Extract PDF content, embed it into Qdrant and maintain a BM25 corpus.

    Attributes:
        qdrant_client: Client used for collection management and upserts.
        chunks: DocumentChunk objects successfully ingested by this instance.
        bm25_corpus: One token list per ingested chunk; may be preloaded from
            ``bm25_corpus_file`` at construction time.
        bm25_index: BM25Okapi built from ``bm25_corpus``, or None when there
            is nothing to index.
        bm25_corpus_file: JSON file used to persist ``bm25_corpus``.
    """

    def __init__(self, qdrant_client: QdrantClient):
        """Ensure the Qdrant collection exists and restore any saved BM25 corpus.

        Args:
            qdrant_client: Client for the target Qdrant instance.
        """
        self.qdrant_client = qdrant_client
        self.chunks = []
        self.bm25_corpus = []
        self.bm25_index = None
        self.bm25_corpus_file = "bm25_calcom_corpus.json"

        # Create collection if it doesn't exist. Any failure of get_collection
        # is treated as "collection missing"; a failing create is only reported.
        try:
            self.qdrant_client.get_collection(collection_name=CONFIG["collection_name"])
            print(f"‚úÖ Collection '{CONFIG['collection_name']}' already exists.")
        except Exception: # Collection does not exist
            try:
                self.qdrant_client.create_collection(
                    collection_name=CONFIG["collection_name"],
                    vectors_config=VectorParams(
                        size=CONFIG["vector_dimension"],
                        distance=Distance.COSINE
                    )
                )
                print(f"‚úÖ Created collection: {CONFIG['collection_name']}")
            except Exception as e:
                print(f"‚ùå Error creating collection: {e}")

        # Attempt to load BM25 corpus
        self.load_bm25_corpus(self.bm25_corpus_file)

        # Rebuild BM25 index if corpus was loaded
        if self.bm25_corpus:
            try:
                self.bm25_index = BM25Okapi(self.bm25_corpus)
                print("‚úÖ BM25 index rebuilt from loaded corpus.")
            except Exception as e:
                print(f"‚ùå Error rebuilding BM25 index from loaded corpus: {e}")
                self.bm25_index = None

    def describe_image_with_gemini(self, image_bytes: bytes) -> str:
        """Generate image description using Gemini Vision.

        Args:
            image_bytes: Encoded image data readable by PIL.

        Returns:
            The model's description, or the fixed string
            "Image description unavailable" when anything fails.
        """
        # The prompt previously started with a stray double quote caused by a
        # fourth quote character after the opening triple quote.
        prompt = """Describe the technical diagrams and tables found in the provided document. For each description, follow these steps:

            Purpose: Begin by stating the primary function or purpose of the diagram or table (e.g., 'This is a wiring diagram showing the electrical connections,' or 'This table provides the technical specifications for the product').

            Components: Provide a detailed breakdown of the visual elements.

            For diagrams: List and explain each labeled component, symbol, or annotation. Describe the relationships or processes shown by arrows or other visual cues.

            For tables: Identify and explain the meaning of each column and row. Highlight the key data points, including any values and units of measurement.

            Synthesis: Conclude with a summary of the most important information presented, explaining how the different parts of the diagram or table work together to convey a complete message."""
        try:
            # genai must already be configured with an API key at this point.
            model = genai.GenerativeModel('gemini-1.5-flash')
            # The context manager releases the decoded image once the call returns.
            with Image.open(io.BytesIO(image_bytes)) as image:
                response = model.generate_content([prompt, image])
            return response.text
        except Exception as e:
            print(f"Error describing image: {e}")
            return "Image description unavailable"

    def _extract_text_chunks(self, page, page_num: int, pdf_path: str) -> List[DocumentChunk]:
        """Return the text chunks of one page (empty list when the page has no text)."""
        text = page.get_text()
        if not text.strip():
            return []

        print(f"Extracted text from page {page_num + 1}. Text length: {len(text)}")
        text_chunks = extract_and_chunk_text(text, CONFIG["chunk_size"], CONFIG["chunk_overlap"])
        print(f"Split into {len(text_chunks)} text chunks on page {page_num + 1}")

        if not text_chunks:
            print(f"‚ö†Ô∏è  No text chunks generated for page {page_num + 1}")
            return []

        return [
            DocumentChunk(
                id=f"page_{page_num}_chunk_{i}",
                text=chunk_text,
                page_number=page_num,
                source_type="document",
                metadata={"pdf_path": pdf_path}
            )
            for i, chunk_text in enumerate(text_chunks)
        ]

    def _extract_image_chunks(self, doc, page, page_num: int, pdf_path: str) -> List[DocumentChunk]:
        """Return one described image chunk per image on the page; failures are logged and skipped."""
        image_chunks = []
        image_list = page.get_images()
        print(f"Found {len(image_list)} images on page {page_num + 1}")

        for img_index, img in enumerate(image_list):
            try:
                xref = img[0]  # first entry of the image tuple is the xref
                pix = fitz.Pixmap(doc, xref)

                # Convert CMYK/other (n > 3) and grayscale (n == 1) pixmaps to RGB
                # before PNG encoding.
                if pix.n > 3 or pix.n == 1:
                    pix = fitz.Pixmap(fitz.csRGB, pix)

                img_data = pix.tobytes("png")
                pix = None  # drop the pixmap reference before the (slow) API call

                print(f"Describing image {img_index} on page {page_num + 1}...")
                description = self.describe_image_with_gemini(img_data)
                print(f"Image description generated for page {page_num + 1}: {description[:50]}...")

                image_chunks.append(
                    DocumentChunk(
                        id=f"page_{page_num}_image_{img_index}",
                        text=description,
                        page_number=page_num,
                        source_type="image",
                        image_description=description,
                        image_data=img_data,
                        metadata={"pdf_path": pdf_path}
                    )
                )
                print(f"üñºÔ∏è  Processed image {img_index} on page {page_num + 1}")
            except Exception as e:
                # Page numbers in messages are one-based, like everywhere else.
                print(f"Error processing image {img_index} on page {page_num + 1}: {e}")

        return image_chunks

    def extract_pdf_content(self, pdf_path: str, max_pages: Optional[int] = None) -> List[DocumentChunk]:
        """Extract text and images from PDF.

        Args:
            pdf_path: Path of the PDF to read.
            max_pages: Optional cap on the number of leading pages to process;
                ``None`` (the default) processes every page.

        Returns:
            Text chunks and image-description chunks in page order.
        """
        doc = fitz.open(pdf_path)
        all_chunks = []
        try:
            total_pages = len(doc)
            if max_pages is None:
                page_limit = total_pages
            else:
                page_limit = max(0, min(max_pages, total_pages))
            # Report what is really processed (the old message claimed a
            # 10-page limit that the loop never applied).
            print(f"üìÑ Processing {page_limit} of {total_pages} pages...")

            for page_num in range(page_limit):
                page = doc.load_page(page_num)
                print(f"Processing page {page_num + 1}...")
                all_chunks.extend(self._extract_text_chunks(page, page_num, pdf_path))
                all_chunks.extend(self._extract_image_chunks(doc, page, page_num, pdf_path))
        finally:
            # Always release the file handle, even if a page fails to load.
            doc.close()

        print(f"‚úÖ Extracted {len(all_chunks)} chunks from PDF")
        return all_chunks

    def ingest_chunks(self, chunks: List[DocumentChunk]):
        """Ingest chunks into vector database and BM25 index.

        Chunks whose embedding cannot be obtained are skipped entirely. For
        every other chunk a Qdrant point and a BM25 token list are created,
        and the chunk is recorded in ``self.chunks``.

        Args:
            chunks: Chunks to embed and index.
        """
        print("üîÑ Starting ingestion process...")
        print(f"Attempting to ingest {len(chunks)} chunks.")

        points = []
        newly_added_corpus = []  # token lists produced by this call only
        ingested_chunks = []     # chunks that actually produced a point

        # First free integer id. Taking the corpus length into account keeps
        # new ids clear of points written in an earlier session whose corpus
        # was reloaded from disk.
        # NOTE(review): if the corpus file is missing but the collection
        # already holds points, ids still start at 0 and overwrite them.
        base_id = max(len(self.chunks), len(self.bm25_corpus))

        for i, chunk in enumerate(chunks):
            print(f"Processing chunk {i+1}/{len(chunks)} (ID: {chunk.id}, Type: {chunk.source_type})")
            embedding = get_gemini_embedding_with_retry(chunk.text, "RETRIEVAL_DOCUMENT")
            if embedding is None:
                print(f"‚ö†Ô∏è  Skipping chunk {chunk.id} - failed to get embedding")
                continue
            print(f"‚úÖ Embedding generated for chunk {chunk.id}")

            points.append(
                PointStruct(
                    id=base_id + len(points),  # consecutive ids, unique across ingestions
                    vector=embedding,
                    payload={
                        "chunk_id": chunk.id,
                        "text": chunk.text,
                        "page_number": chunk.page_number,
                        "source_type": chunk.source_type,
                        "image_description": chunk.image_description,
                        "has_image": chunk.image_data is not None,
                        "metadata": chunk.metadata or {}
                    }
                )
            )
            print(f"‚úÖ Point prepared for chunk {chunk.id}. Total points prepared: {len(points)}")

            # Prepare for BM25 using NLTK and stop words
            try:
                tokens = word_tokenize(chunk.text.lower())
                filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
                newly_added_corpus.append(filtered_tokens)
                print(f"‚úÖ Tokenized and filtered chunk {chunk.id} for BM25.")
            except Exception as e:
                print(f"‚ùå Error tokenizing/filtering chunk {chunk.id} for BM25: {e}")
                newly_added_corpus.append([]) # Add empty list to maintain corpus length

            ingested_chunks.append(chunk)

        # Record each ingested chunk exactly once (previously the whole input
        # list was appended on every loop iteration).
        self.chunks.extend(ingested_chunks)

        print(f"Prepared {len(points)} points for upserting to Qdrant.")
        if points:
            try:
                response = self.qdrant_client.upsert(
                    collection_name=CONFIG["collection_name"],
                    points=points,
                    wait=True # Wait for the operation to complete
                )
                print(f"‚úÖ Qdrant upsert response: {response}")
                print(f"‚úÖ Upserted {len(points)} points to Qdrant.")
            except Exception as e:
                print(f"‚ùå Error during Qdrant upsert: {e}")
        else:
            print("‚ö†Ô∏è No points to upsert to Qdrant.")

        # Accumulate the new corpus
        self.bm25_corpus.extend(newly_added_corpus)

        # Build BM25 index from the accumulated corpus; any() is False when
        # every token list is empty.
        if self.bm25_corpus and any(self.bm25_corpus):
            self.bm25_index = BM25Okapi(self.bm25_corpus)
            print(f"‚úÖ BM25 index built with {len(self.bm25_corpus)} documents.")
            self.save_bm25_corpus()
        else:
            self.bm25_index = None # Ensure index is None if corpus is empty
            self.bm25_corpus = []
            print("‚ö†Ô∏è No corpus for building BM25 index.")

        print(f"‚úÖ Successfully processed {len(points)} chunks for ingestion")

    def ingest_document(self, pdf_path: str):
        """Complete ingestion pipeline for a PDF document.

        Args:
            pdf_path: Path of the PDF to extract and ingest.
        """
        chunks = self.extract_pdf_content(pdf_path)
        self.ingest_chunks(chunks)
        # Saved again here so the file is written even when ingest_chunks
        # ended up with an empty corpus and skipped its own save.
        self.save_bm25_corpus()

    def load_bm25_corpus(self, file_path: str):
        """Loads the tokenized BM25 corpus from a JSON file.

        On a missing file, unreadable content or a non-list payload,
        ``self.bm25_corpus`` is reset to an empty list.

        Args:
            file_path: Location of the JSON corpus file.
        """
        if not os.path.exists(file_path):
            print(f"‚ö†Ô∏è BM25 corpus file not found at {file_path}. BM25 index will be built during ingestion.")
            self.bm25_corpus = []
            return

        try:
            with open(file_path, "r", encoding="utf-8") as f:
                loaded_corpus = json.load(f)
        except Exception as e:
            print(f"‚ùå Error loading BM25 corpus from {file_path}: {e}")
            self.bm25_corpus = []
            return

        if isinstance(loaded_corpus, list):
            self.bm25_corpus = loaded_corpus
            # Success is reported only when the payload was accepted.
            print(f"‚úÖ BM25 corpus loaded from {file_path}")
        else:
            print(f"‚ö†Ô∏è Loaded BM25 corpus from {file_path} is not a list. Initializing with empty corpus.")
            self.bm25_corpus = []

    def save_bm25_corpus(self):
        """Saves the tokenized BM25 corpus to a JSON file (errors are printed, not raised)."""
        try:
            with open(self.bm25_corpus_file, "w", encoding="utf-8") as f:
                json.dump(self.bm25_corpus, f)
            print(f"‚úÖ BM25 corpus saved to {self.bm25_corpus_file}")
        except Exception as e:
            print(f"‚ùå Error saving BM25 corpus: {e}")

def create_upload_interface():
    """Create file upload interface.

    Displays a single-file PDF upload widget. When a file is uploaded, its
    content is written to the current working directory under the file's
    base name. The callback only saves the file; it does not start ingestion.

    Returns:
        The ``widgets.FileUpload`` instance that was displayed.
    """
    print("üìÅ Upload a PDF document to get started:")

    upload_button = widgets.FileUpload(
        accept='.pdf',
        multiple=False,
        description='Choose PDF'
    )

    def _first_upload(value):
        """Return (filename, content) of the first upload for either value layout."""
        if isinstance(value, dict):
            # ipywidgets 7.x: {filename: {"metadata": ..., "content": bytes}}
            filename = next(iter(value))
            return filename, value[filename]['content']
        # ipywidgets 8.x: tuple of {"name": ..., "content": memoryview, ...}
        first = value[0]
        return first['name'], first['content']

    def on_upload_change(change):
        uploads = change['new']
        if not uploads:
            return None

        filename, content = _first_upload(uploads)

        # The name comes from the browser; keep only the final path component
        # so the file cannot be written outside the working directory.
        safe_name = os.path.basename(filename)
        if not safe_name:
            print(f"Ignoring upload with unusable file name: {filename!r}")
            return None

        with open(safe_name, 'wb') as f:
            f.write(content)

        print(f"‚úÖ Uploaded: {safe_name}")
        return safe_name

    upload_button.observe(on_upload_change, names='value')
    display(upload_button)

    return upload_button


# ==========================================
# 7. DEMONSTRATION WORKFLOW
# ==========================================

def run_demo():
    """Set up the demo: connect to Qdrant, build the ingester, show the upload widget.

    Returns:
        A ``(ingester, upload_widget)`` tuple. No search engine is created here.
    """
    print("üöÄ Starting Hybrid Multimodal Search Demo")
    print("=" * 50)

    # The client may be None if create_qdrant_client() failed; it is passed on as-is.
    document_ingester = DocumentIngester(create_qdrant_client())

    # Display the PDF picker and hand both objects back to the caller.
    picker = create_upload_interface()
    return document_ingester, picker


# ==========================================
# 8. MAIN EXECUTION
# ==========================================

if __name__ == "__main__":
    # Build the ingester and display the upload widget.
    ingester, upload_widget = run_demo()

    # NOTE(review): the instructions below refer to `search_engine`, but
    # run_demo() has the HybridSearchEngine construction commented out and
    # returns only the ingester and the widget, so that name is not created
    # by this cell. Also, the upload callback only writes the file to disk;
    # the "processing" in step 2 presumably requires calling
    # ingester.ingest_document(<filename>) manually — confirm.
    print("""
üìã INSTRUCTIONS:
1. Upload a PDF using the file picker above
2. Wait for processing to complete
3. Run searches using the search_engine.hybrid_search() function

Example usage:
search_engine.hybrid_search("your query here")
search_engine.display_results(results)
""")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


‚úÖ All libraries installed successfully!
‚úÖ Gemini API configured successfully!
‚úÖ NLTK stop words loaded successfully!
üöÄ Starting Hybrid Multimodal Search Demo
‚úÖ Qdrant client initialized successfully!
‚úÖ Collection 'calcom_help_docs' already exists.
‚ö†Ô∏è BM25 corpus file not found at bm25_calcom_corpus.json. BM25 index will be built during ingestion.
üìÅ Upload a PDF document to get started:


FileUpload(value={}, accept='.pdf', description='Choose PDF')


üìã INSTRUCTIONS:
1. Upload a PDF using the file picker above
2. Wait for processing to complete
3. Run searches using the search_engine.hybrid_search() function

Example usage:
search_engine.hybrid_search("your query here")
search_engine.display_results(results)



## Modify `HybridSearchEngine` initialization

### Subtask:
Update the `HybridSearchEngine` class to load the BM25 corpus from the saved file during its initialization and build its own BM25 index. Remove the dependency on the `DocumentIngester` instance.


**Reasoning**:
Update the HybridSearchEngine class to load the BM25 corpus and build the index in its initialization, removing the dependency on DocumentIngester.



In [None]:
from sentence_transformers.cross_encoder import CrossEncoder

# Instantiate the 'BAAI/bge-reranker-large' cross-encoder once at module level
# so later cells can reuse `reranker_model`.
# NOTE(review): presumably used to re-score search hits; the consumer is not
# visible in this part of the notebook.
reranker_model = CrossEncoder('BAAI/bge-reranker-large')
print("‚úÖ Reranker model loaded successfully!")

‚úÖ Reranker model loaded successfully!


In [None]:
import json
import os
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
from typing import List, Dict, Any, Optional, Tuple
from dataclasses import dataclass
import numpy as np
from rank_bm25 import BM25Okapi
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import io
from PIL import Image
import fitz
from IPython.display import display, Image as IPImage

# Re-declaration of the DocumentChunk dataclass from the ingestion cell
# (same fields); running this cell rebinds the name.
@dataclass
class DocumentChunk:
    """Record for a single piece of PDF content (text chunk or image description)."""
    id: str  # chunk identifier, e.g. "page_{n}_chunk_{i}" / "page_{n}_image_{i}" as produced by the ingestion cell
    text: str  # chunk text; for image chunks, the generated description
    page_number: int  # zero-based page index in the ingestion cell
    source_type: str  # "document" or "image" in the ingestion cell
    image_description: Optional[str] = None  # set for image chunks
    image_data: Optional[bytes] = None  # encoded image bytes, when available
    metadata: Optional[Dict] = None  # free-form extra data (the ingestion cell stores {"pdf_path": ...})

# Re-declaration of the SearchResult dataclass from the ingestion cell (same fields).
@dataclass
class SearchResult:
    """A single search hit, tagged with the search mode that produced it."""
    chunk: DocumentChunk  # the matched chunk
    score: float  # relevance score; scale presumably depends on search_type — TODO confirm
    rank: int  # position in the result list; 0- or 1-based is not visible here
    search_type: str  # 'semantic', 'keyword', or 'hybrid'

# Re-declaration of the shared settings; values match the ingestion cell.
CONFIG = {
    "collection_name": "calcom_help_docs",  # Qdrant collection name
    "vector_dimension": 768,  # must match the embedding length stored in the collection (ingestion uses "models/embedding-001")
    "chunk_size": 1000,  # maximum characters per text chunk
    "chunk_overlap": 200,  # characters shared between consecutive chunks
    "top_k_results": 5, # presumably the number of hits returned by search — verify against HybridSearchEngine
    "rrf_k": 60  # RRF parameter; same value as reciprocal_rank_fusion's default k
}

# Rebuild the English stop-word set for this cell; falls back to an empty set
# (no stop-word filtering) when the guard catches LookupError.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    print("‚ùå NLTK data not found. Please run the cell to download 'punkt' and 'stopwords'.")
    stop_words = set()

# Embedding helper for this cell; delegates to the retrying implementation
# defined in the ingestion cell instead of being an empty stub.
def get_gemini_embedding(text: str, task_type: str = "RETRIEVAL_DOCUMENT") -> Optional[List[float]]:
    """Get embedding from Gemini API.

    Previously the body was ``pass``, so every call returned ``None``. The
    call is now forwarded to ``get_gemini_embedding_with_retry`` (defined in
    the ingestion cell, which must have been run first).

    Args:
        text: Content to embed.
        task_type: Task type forwarded to the embedding call.

    Returns:
        The embedding vector, or ``None`` when the underlying helper fails.
    """
    return get_gemini_embedding_with_retry(text, task_type)

# Placeholder only: the working implementation visible in this notebook is the
# describe_image_with_gemini method of the ingestion cell's DocumentIngester.
def describe_image_with_gemini(image_bytes: bytes) -> str:
    """Generate image description using Gemini Vision.

    NOTE(review): this is a stub. The body is ``pass``, so calling it returns
    ``None`` rather than the ``str`` the annotation promises.
    """
    pass

# Reciprocal Rank Fusion (RRF) helper, redefined in this cell.
def reciprocal_rank_fusion(semantic_results: List[Tuple], keyword_results: List[Tuple], k: int = 60) -> List[Tuple]:
    """Combine semantic and keyword search results using Reciprocal Rank Fusion.

    Each ranking contributes ``1 / (k + rank + 1)`` per document, where
    ``rank`` is the zero-based position in that ranking; contributions from
    both rankings are summed.

    Args:
        semantic_results: Ranked tuples whose first element is the document id.
        keyword_results: Ranked tuples whose first element is the document id.
        k: RRF smoothing constant.

    Returns:
        ``(doc_id, fused_score)`` tuples sorted by descending fused score.
        Ties keep first-seen order (semantic ranking first, then keyword), so
        the output is deterministic. If an id appears more than once in one
        ranking, only its first (best) position counts.
    """
    fused_scores: Dict[Any, float] = {}

    for ranking in (semantic_results, keyword_results):
        seen_in_ranking = set()
        for rank, item in enumerate(ranking):
            doc_id = item[0]
            if doc_id in seen_in_ranking:
                # A repeated id must not replace its better, earlier rank.
                continue
            seen_in_ranking.add(doc_id)
            fused_scores[doc_id] = fused_scores.get(doc_id, 0) + 1.0 / (k + rank + 1)

    # sorted() is stable, so equal scores stay in insertion order.
    return sorted(fused_scores.items(), key=lambda pair: pair[1], reverse=True)

# Text chunking helper, redefined in this cell.
def extract_and_chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
    """Split text into overlapping chunks, preferring sentence/line boundaries.

    Args:
        text: Text to split.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters a chunk shares with its predecessor.

    Returns:
        ``[text]`` unchanged when it already fits in one chunk; otherwise the
        stripped, non-empty chunks in document order. The last chunk ends at
        the end of the text, so no overlap-only trailing chunk is produced.

    Raises:
        ValueError: If the text needs splitting and ``chunk_size`` is not
            positive or ``overlap`` is not smaller than ``chunk_size``
            (either would prevent forward progress).
    """
    if len(text) <= chunk_size:
        return [text]

    # Validated only once splitting is actually required, so short inputs
    # keep returning [text] regardless of the other arguments.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    chunks = []
    text_length = len(text)
    start = 0
    while start < text_length:
        end = min(start + chunk_size, text_length)

        # Try to break at a sentence or line boundary, but only in the second
        # half of the window so chunks do not become too small.
        if end < text_length:
            window = text[start:end]
            # rfind() positions are relative to the window, not to the text.
            break_offset = max(window.rfind('.'), window.rfind('\n'))
            if break_offset > chunk_size // 2:
                end = start + break_offset + 1

        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)

        if end >= text_length:
            break

        # Step back by the overlap, but always move forward by at least one
        # character even when a boundary break shortened the chunk.
        start = max(end - overlap, start + 1)

    return chunks


# Dummy DocumentIngester for necessary type hinting and potential future use if needed elsewhere.
# NOTE(review): running this cell rebinds the name DocumentIngester, replacing
# the working class defined in the ingestion cell. Instances created before
# this cell keep the original behaviour; instances created afterwards get
# these no-op methods.
class DocumentIngester:
    """Placeholder with the same attributes as the real ingester; every method is a no-op."""

    def __init__(self, qdrant_client: QdrantClient):
        """Store the client and initialise empty BM25 state; nothing is loaded or created."""
        self.qdrant_client = qdrant_client
        self.chunks = []
        self.bm25_corpus = []
        self.bm25_index = None
        self.bm25_corpus_file = "bm25_calcom_corpus.json"

    def extract_pdf_content(self, pdf_path: str) -> List[DocumentChunk]:
        """Stub: returns None (not a list) because the body is ``pass``."""
        pass

    def ingest_chunks(self, chunks: List[DocumentChunk]):
        """Stub: does nothing."""
        pass

    def ingest_document(self, pdf_path: str):
        """Stub: does nothing."""
        pass

    def load_bm25_corpus(self, file_path: str):
        """Stub: does nothing; ``bm25_corpus`` stays as set in ``__init__``."""
        pass

    def save_bm25_corpus(self):
        """Stub: does nothing; no file is written."""
        pass


class HybridSearchEngine:
    """Hybrid retrieval: Qdrant vector search + BM25 keyword search, fused and re-ranked.

    On construction the engine loads a tokenized BM25 corpus from a JSON file
    and reads every chunk payload from the Qdrant collection. BM25 result
    index ``i`` is mapped to ``self.chunks[i]``, so the corpus file and the
    collection must describe the same chunks in the same order.
    """

    def __init__(self, qdrant_client: QdrantClient, bm25_corpus_file: str = "bm25_corpus.json"):
        """Create the engine and eagerly load BM25 data and chunk payloads.

        Args:
            qdrant_client: Client connected to the store that holds the chunks.
            bm25_corpus_file: Path of the JSON file with the tokenized corpus.
                NOTE(review): the DocumentIngester definitions in this notebook
                write to "bm25_calcom_corpus.json"; pass that path here if the
                ingester in use does the same, otherwise keyword search is
                unavailable. The default is kept for backward compatibility.
        """
        self.qdrant_client = qdrant_client
        self.bm25_corpus = []
        self.bm25_index = None
        self.bm25_corpus_file = bm25_corpus_file
        self.chunks = []  # Needed to map BM25 row indices back to DocumentChunk objects

        # Load BM25 corpus and build index
        self._load_bm25_corpus_and_build_index()

        # Load chunks from Qdrant to map BM25 results back
        self._load_chunks_from_qdrant()

    def _load_bm25_corpus_and_build_index(self):
        """Loads the tokenized BM25 corpus from a JSON file and builds the index.

        Leaves ``self.bm25_index`` as ``None`` when the file is missing,
        unreadable, or contains no non-empty token list.
        """
        if not os.path.exists(self.bm25_corpus_file):
            print(f"‚ö†Ô∏è BM25 corpus file not found at {self.bm25_corpus_file}. BM25 keyword search will not be available.")
            self.bm25_corpus = []
            self.bm25_index = None
            return

        try:
            with open(self.bm25_corpus_file, "r") as f:
                self.bm25_corpus = json.load(f)
            print(f"‚úÖ BM25 corpus loaded from {self.bm25_corpus_file}")

            # BM25Okapi needs at least one non-empty document to be meaningful.
            if self.bm25_corpus and any(self.bm25_corpus):
                self.bm25_index = BM25Okapi(self.bm25_corpus)
                print("‚úÖ BM25 index built from loaded corpus.")
                print(f"BM25 corpus size: {len(self.bm25_corpus)}")
            else:
                self.bm25_index = None
                print("‚ö†Ô∏è Loaded BM25 corpus is empty or contains only empty lists. BM25 index not built.")

        except Exception as e:
            print(f"‚ùå Error loading or building BM25 index from {self.bm25_corpus_file}: {e}")
            self.bm25_corpus = []
            self.bm25_index = None

    def _load_chunks_from_qdrant(self, page_size: int = 10000):
        """Loads chunk information from Qdrant to map BM25 results.

        Pages through the whole collection with ``scroll`` (following the
        returned offset) so collections larger than one page are not silently
        truncated. On any error ``self.chunks`` is reset to an empty list.

        Args:
            page_size: Number of points requested per scroll call.
        """
        try:
            loaded_chunks = []
            offset = None
            while True:
                records, offset = self.qdrant_client.scroll(
                    collection_name=CONFIG["collection_name"],
                    limit=page_size,
                    offset=offset,
                    with_payload=True,
                    with_vectors=False
                )
                for record in records:
                    payload = record.payload
                    loaded_chunks.append(DocumentChunk(
                        id=payload["chunk_id"],
                        text=payload["text"],
                        page_number=payload["page_number"],
                        source_type=payload["source_type"],
                        image_description=payload.get("image_description"),
                        image_data=None,  # Image bytes are not stored in the Qdrant payload
                        metadata=payload.get("metadata", {})
                    ))
                # A missing offset means the last page was reached; the empty
                # page check guards against looping on a misbehaving backend.
                if offset is None or not records:
                    break

            self.chunks = loaded_chunks
            print(f"‚úÖ Loaded {len(self.chunks)} chunks from Qdrant.")
            print(f"Number of chunks loaded from Qdrant: {len(self.chunks)}")

        except Exception as e:
            print(f"‚ùå Error loading chunks from Qdrant: {e}")
            self.chunks = []

    def semantic_search(self, query: str, top_k: int = 10, source_type: Optional[str] = None) -> List[Tuple]:
        """Perform semantic search using Qdrant.

        Args:
            query: Natural-language query to embed.
            top_k: Maximum number of hits to return.
            source_type: If given, only points whose payload ``source_type``
                matches are returned.

        Returns:
            List of ``(chunk_id, score)`` tuples; empty if the embedding or the
            search fails.
        """
        # Get query embedding
        query_embedding = get_gemini_embedding(query, "RETRIEVAL_QUERY")
        if query_embedding is None:
            return []

        # Prepare filter
        filter_condition = None
        if source_type:
            filter_condition = Filter(
                must=[FieldCondition(key="source_type", match=MatchValue(value=source_type))]
            )

        # Search
        print(f"Performing semantic search for query: '{query}'")
        try:
            results = self.qdrant_client.search(
                collection_name=CONFIG["collection_name"],
                query_vector=query_embedding,
                query_filter=filter_condition,
                limit=top_k
            )
            print(f"Semantic search returned {len(results)} results.")
            return [(result.payload["chunk_id"], result.score) for result in results]
        except Exception as e:
            print(f"‚ùå Error during semantic search: {e}")
            return []

    def keyword_search(self, query: str, top_k: int = 10, source_type: Optional[str] = None) -> List[Tuple]:
        """Perform keyword search using BM25.

        Args:
            query: Query text; tokenized and stop-word filtered like the corpus.
            top_k: Maximum number of hits to return.
            source_type: If given, only chunks with this ``source_type`` are
                returned, mirroring the filter of ``semantic_search``.

        Returns:
            List of ``(chunk_id, bm25_score)`` tuples with positive score, best
            first; empty if the index or chunks are unavailable.
        """
        if self.bm25_index is None or not self.chunks:
            print("‚ö†Ô∏è BM25 index or chunks not available for keyword search.")
            return []

        print(f"Performing keyword search for query: '{query}'")
        # Use NLTK for tokenization and stop word removal for the query as well
        query_tokens = word_tokenize(query.lower())
        filtered_query_tokens = [word for word in query_tokens if word.isalnum() and word not in stop_words]

        if not filtered_query_tokens:
            print("‚ö†Ô∏è No valid tokens for keyword search.")
            return []

        scores = self.bm25_index.get_scores(filtered_query_tokens)

        if len(scores) != len(self.chunks):
            # Row i of the corpus is assumed to be self.chunks[i]; a size
            # mismatch means some hits may be attributed to the wrong chunk.
            print(f"Warning: BM25 corpus size ({len(scores)}) differs from number of chunks ({len(self.chunks)}).")

        # Walk the ranking best-first and collect up to top_k usable hits.
        results = []
        for i in np.argsort(scores)[::-1]:
            if len(results) >= top_k:
                break
            if not scores[i] > 0:  # also skips NaN scores
                continue
            if i >= len(self.chunks):
                print(f"Warning: Index {i} out of bounds for self.chunks ({len(self.chunks)}). Skipping.")
                continue
            chunk = self.chunks[i]
            if source_type and chunk.source_type != source_type:
                continue
            results.append((chunk.id, scores[i]))

        print(f"Keyword search returned {len(results)} results.")

        return results

    def _build_search_results(self, scored_ids, chunk_dict, search_type: str) -> List[SearchResult]:
        """Turn ordered ``(chunk_id, score)`` pairs into ranked SearchResult objects."""
        search_results = []
        for chunk_id, score in scored_ids:
            if chunk_id not in chunk_dict:
                print(f"Warning: Chunk ID {chunk_id} not found in chunk_dict.")
                continue
            search_results.append(SearchResult(
                chunk=chunk_dict[chunk_id],
                score=score,
                rank=len(search_results) + 1,
                search_type=search_type
            ))
        return search_results

    def hybrid_search(self, query: str, top_k: int = 10, source_type: Optional[str] = None) -> List[SearchResult]:
        """Perform hybrid search combining semantic and keyword search.

        Both searches fetch ``top_k * 2`` candidates, the rankings are fused
        with reciprocal rank fusion, and up to ``top_k * 3`` fused candidates
        are re-scored with ``reranker_model``. If the reranker is missing or
        fails, the RRF order and scores are returned instead.

        Args:
            query: User query.
            top_k: Number of results to return.
            source_type: Optional ``source_type`` restriction applied to both
                the semantic and the keyword search.

        Returns:
            Up to ``top_k`` SearchResult objects whose ``search_type`` is
            ``"hybrid_reranked"`` or, on reranker failure, ``"hybrid_rrf"``.
        """
        print(f"üîç Searching for: '{query}'")

        # Perform both searches, over-fetching so the fusion has material to work with
        semantic_results = self.semantic_search(query, top_k * 2, source_type)
        keyword_results = self.keyword_search(query, top_k * 2, source_type)

        print(f"üìä Semantic results: {len(semantic_results)}, Keyword results: {len(keyword_results)}")

        # Apply RRF
        print("Applying Reciprocal Rank Fusion (RRF)...")
        fused_results = reciprocal_rank_fusion(semantic_results, keyword_results, CONFIG["rrf_k"])
        print(f"RRF produced {len(fused_results)} fused results.")

        # Prepare for Re-ranking
        print("Preparing for Re-ranking...")
        chunk_dict = {chunk.id: chunk for chunk in self.chunks}

        # Limit the number of results sent to the reranker to a reasonable amount
        rerank_limit = min(len(fused_results), top_k * 3)
        print(f"Sending top {rerank_limit} fused results to reranker.")

        # Keep only candidates whose text can be resolved. Ids and pairs are
        # derived from this one list so scores always line up with their chunk.
        candidates = []
        for chunk_id, rrf_score in fused_results[:rerank_limit]:
            if chunk_id in chunk_dict:
                candidates.append((chunk_id, rrf_score))
            else:
                print(f"Warning: Chunk ID {chunk_id} not found in chunk_dict for reranking.")

        rerank_pairs = [[query, chunk_dict[chunk_id].text] for chunk_id, _ in candidates]

        # Perform Re-ranking
        print(f"Performing Re-ranking using BAAI/bge-reranker-large on {len(rerank_pairs)} pairs...")
        if not rerank_pairs:
            search_results = []
            print("‚ö†Ô∏è No pairs for re-ranking.")
        else:
            try:
                # reranker_model is expected to be loaded in an earlier cell.
                rerank_scores = reranker_model.predict(rerank_pairs)
                print("‚úÖ Re-ranking completed.")
                candidate_ids = [chunk_id for chunk_id, _ in candidates]
                scored = sorted(zip(candidate_ids, rerank_scores), key=lambda x: x[1], reverse=True)
                print(f"Re-ranked {len(scored)} results.")
                search_type = "hybrid_reranked"
            except Exception as e:
                if isinstance(e, NameError):
                    print("‚ùå Reranker model not found. Please ensure 'reranker_model' is loaded.")
                else:
                    print(f"Re-ranking failed: {e}")
                # Candidates are already in RRF order with their RRF scores.
                print("Using RRF scores due to reranker error.")
                scored = candidates
                search_type = "hybrid_rrf"

            search_results = self._build_search_results(scored[:top_k], chunk_dict, search_type)

        print(f"‚úÖ Hybrid search completed. Returning {len(search_results)} results.")
        return search_results

    def display_results(self, results: List[SearchResult]):
        """Display search results in a formatted way.

        Prints rank, score, source type, 1-based page number and up to 500
        characters of text per result. Image chunks show only their
        description because image bytes are not loaded by this engine.
        """
        if not results:
            print("‚ùå No results found")
            return

        print(f"\nüéØ Found {len(results)} results:\n")
        print("=" * 80)

        for result in results:
            chunk = result.chunk
            print(f"üìÑ Rank #{result.rank} | Score: {result.score:.4f} | Type: {chunk.source_type} | Search Type: {result.search_type}")
            print(f"üìç Page {chunk.page_number + 1} | ID: {chunk.id}")

            if chunk.source_type == "image" and chunk.image_description:
                print("üñºÔ∏è  IMAGE CONTENT (Description Only):")
                print(chunk.image_description[:200] + ("..." if len(chunk.image_description) > 200 else ""))
            elif chunk.source_type == "image":
                print("üñºÔ∏è  IMAGE CONTENT (No Description Available).")

            print("üìù Content:")
            print(chunk.text[:500] + ("..." if len(chunk.text) > 500 else ""))
            print("=" * 80)

**Reasoning**:
Review the keyword_search method to confirm it uses the internally loaded BM25 index and chunks.



In [None]:
# Ingest the uploaded PDF
# uploaded_pdf_path = "/content/series-150s-and-157s-low-water-cut-offs-pump-controllers-mm-217p.pdf" # Correct path in Colab
uploaded_pdf_path = "/content/3102-flygt-workshop_manual.pdf"

# Fail early with an actionable message instead of an error from deep inside the PDF reader.
if not os.path.exists(uploaded_pdf_path):
    raise FileNotFoundError(
        f"PDF not found at '{uploaded_pdf_path}'. Upload the file to the Colab session "
        "or update uploaded_pdf_path before running this cell."
    )

ingester.ingest_document(uploaded_pdf_path)

# Search must read from the same Qdrant store the ingester wrote to. A newly
# created in-memory client would start with an empty collection, so reuse the
# ingester's client and only create a new one if it does not expose it.
qdrant_client = getattr(ingester, "qdrant_client", None) or create_qdrant_client()

# Re-initialize the search engine after ingestion
search_engine = HybridSearchEngine(qdrant_client)


FileNotFoundError: no such file: '/content/3102-flygt-workshop_manual.pdf'

**Reasoning**:
Review the `_load_chunks_from_qdrant` and `display_results` methods to confirm they correctly handle document and page information.



In [None]:
# Requires `search_engine` from the ingestion cell; hybrid_search additionally
# looks up `reranker_model` and falls back to RRF scores when it is undefined.

# Retrieve the three best chunks for the maintenance question.
reranked_results = search_engine.hybrid_search(
    "How do I replace the mechanical seal on the Flygt 3102?",
    top_k=3,
)

# Print rank, score, page and a text preview for each hit.
search_engine.display_results(reranked_results)

In [None]:
# Assuming reranked_results and genai are available

def format_results_with_llm(results: "List[SearchResult]", query: str, model_name: str = 'gemini-1.5-flash') -> str:
    """Ask a Gemini model for step-by-step instructions grounded in the search results.

    Builds a context block from the query and each result (source document,
    1-based page, score, image note, and up to 1000 characters of text),
    embeds it in a technician-oriented prompt and returns the model's answer.

    Args:
        results: Search results to include in the prompt.
        query: The user's question.
        model_name: Name of the Gemini model to call.

    Returns:
        The generated text, ``"No results found."`` when ``results`` is empty,
        or ``"Error generating summary with LLM."`` if the model call fails.
    """
    if not results:
        return "No results found."

    context = "Search Query: " + query + "\n\nSearch Results:\n"
    for i, result in enumerate(results):
        # Metadata may be missing, and markdown chunks store their source under
        # "markdown_path" rather than "pdf_path".
        metadata = result.chunk.metadata or {}
        doc_reference = metadata.get("pdf_path") or metadata.get("markdown_path") or "Unknown Document"
        page_reference = result.chunk.page_number + 1
        context += f"--- Result {i+1} (Document: {doc_reference}, Page: {page_reference}, Score: {result.score:.4f}) ---\n"

        # Include image information if available
        if result.chunk.source_type == "image" and result.chunk.image_description:
            context += f"Image Description: {result.chunk.image_description}\n"
            context += "Reference to Image included in this chunk.\n"
        elif result.chunk.image_data:  # Description missing but image bytes present
            context += "Reference to Image included in this chunk.\n"

        # Include text content, truncated to keep the prompt bounded
        context += result.chunk.text[:1000] + ("..." if len(result.chunk.text) > 1000 else "") + "\n\n"

    prompt = f"""You are a technical assistant helping field technicians perform maintenance on industrial equipment using official manuals.
    Your goal is to extract clear, complete, and accurate instructions from the provided documentation to help the technician answer their question.
    Respond to the question below using only the content from the provided manual or documentation.
    ---
    Question:
    {context}
    ---
    When answering:
    - Break the response into clear **step-by-step instructions**, using bullet points or numbered steps.
    - Include **model-specific notes** if the document mentions multiple variants (e.g. 3127 vs. 3102).
    - Include **tool names**, **part numbers**, **measurements**, **torque specs**, and **safety precautions** if mentioned.
    - Reference the **exact page number(s)** where the information is found.
    - If the steps are only partially described, clearly state what is **missing** or **not covered** in the documentation.
    - Use a tone that is professional, instructional, and assumes the user has technical knowledge but needs procedural clarity.
    - Avoid assumptions or external knowledge. If something is unclear or missing, say so.
    ---
    If diagrams or figures are referenced in the document, include them in the explanation or summarize what they show."""

    try:
        # genai must already be configured with an API key.
        model = genai.GenerativeModel(model_name)
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        print(f"Error formatting results with LLM: {e}")
        return "Error generating summary with LLM."

# Summarize the hits from the search cell, if that cell has been run.
if 'reranked_results' not in locals():
    print("‚ùå Reranked results not found. Please run the previous steps.")
else:
    llm_summary = format_results_with_llm(
        reranked_results,
        "How do I replace the mechanical seal on the Flygt 3102?",
    )
    print("\n--- LLM Summary ---")
    print(llm_summary)

# Task
Update the document ingestion process to handle markdown files in addition to PDF files.

## Modify `DocumentIngester`

### Subtask:
Update the `DocumentIngester` class to include a method for extracting text from markdown files.


**Reasoning**:
Define the `extract_markdown_content` method within the `DocumentIngester` class to handle markdown file processing and chunking.



In [None]:
class DocumentIngester:
    """Extracts chunks from PDF/markdown files and indexes them in Qdrant and BM25.

    Each ingested chunk becomes one Qdrant point and one row of the tokenized
    BM25 corpus. Both are appended in lockstep, and the point id equals the
    row's position in the corpus, so a BM25 row index identifies its point.
    The corpus is persisted to ``self.bm25_corpus_file`` as JSON.
    """

    def __init__(self, qdrant_client: QdrantClient):
        """Ensure the collection exists and restore any persisted BM25 corpus.

        Args:
            qdrant_client: Client for the Qdrant instance to write to.
        """
        self.qdrant_client = qdrant_client
        self.chunks = []
        self.bm25_corpus = []
        self.bm25_index = None
        self.bm25_corpus_file = "bm25_calcom_corpus.json"

        # The collection is intentionally not deleted here; drop it manually
        # (delete_collection) when a clean start is wanted.

        # Create collection if it doesn't exist
        try:
            self.qdrant_client.get_collection(collection_name=CONFIG["collection_name"])
            print(f"‚úÖ Collection '{CONFIG['collection_name']}' already exists.")
        except Exception:  # get_collection failed; treated as "collection does not exist"
            try:
                self.qdrant_client.create_collection(
                    collection_name=CONFIG["collection_name"],
                    vectors_config=VectorParams(
                        size=CONFIG["vector_dimension"],
                        distance=Distance.COSINE
                    )
                )
                print(f"‚úÖ Created collection: {CONFIG['collection_name']}")
            except Exception as e:
                print(f"‚ùå Error creating collection: {e}")

        # Attempt to load BM25 corpus
        self.load_bm25_corpus(self.bm25_corpus_file)

        # Rebuild BM25 index if corpus was loaded
        if self.bm25_corpus:
            try:
                self.bm25_index = BM25Okapi(self.bm25_corpus)
                print("‚úÖ BM25 index rebuilt from loaded corpus.")
            except Exception as e:
                print(f"‚ùå Error rebuilding BM25 index from loaded corpus: {e}")
                self.bm25_index = None

    def describe_image_with_gemini(self, image_bytes: bytes) -> str:
        """Generate image description using Gemini Vision.

        Args:
            image_bytes: Encoded image data readable by PIL.

        Returns:
            The model's description, or ``"Image description unavailable"`` if
            decoding or the model call fails.
        """
        try:
            # Convert bytes to PIL Image
            image = Image.open(io.BytesIO(image_bytes))

            # genai must be configured with an API key before this is called.
            model = genai.GenerativeModel('gemini-1.5-flash')

            prompt = """Describe the technical diagrams and tables found in the provided document. For each description, follow these steps:

            Purpose: Begin by stating the primary function or purpose of the diagram or table (e.g., 'This is a wiring diagram showing the electrical connections,' or 'This table provides the technical specifications for the product').

            Components: Provide a detailed breakdown of the visual elements.

            For diagrams: List and explain each labeled component, symbol, or annotation. Describe the relationships or processes shown by arrows or other visual cues.

            For tables: Identify and explain the meaning of each column and row. Highlight the key data points, including any values and units of measurement.

            Synthesis: Conclude with a summary of the most important information presented, explaining how the different parts of the diagram or table work together to convey a complete message."""

            response = model.generate_content([prompt, image])
            return response.text
        except Exception as e:
            print(f"Error describing image: {e}")
            return "Image description unavailable"

    def extract_pdf_content(self, pdf_path: str, max_pages: Optional[int] = None) -> List[DocumentChunk]:
        """Extract text and images from PDF.

        Page text is split with ``extract_and_chunk_text``; every embedded
        image is described with Gemini and stored as an ``"image"`` chunk
        whose text is that description.

        Args:
            pdf_path: Path of the PDF file.
            max_pages: Process at most this many leading pages; ``None`` (the
                default) processes the whole document.

        Returns:
            All chunks in page order. Chunk ids are prefixed with the file's
            base name so that ids from different PDFs do not collide.
        """
        all_chunks = []
        doc_name = os.path.basename(pdf_path)
        doc = fitz.open(pdf_path)

        try:
            page_count = len(doc) if max_pages is None else max(0, min(len(doc), max_pages))
            print(f"üìÑ Processing PDF with {page_count} pages...")

            for page_num in range(page_count):
                page = doc.load_page(page_num)
                print(f"Processing page {page_num + 1}...")

                # Extract text
                text = page.get_text()
                if text.strip():
                    print(f"Extracted text from page {page_num + 1}. Text length: {len(text)}")
                    text_chunks = extract_and_chunk_text(text, CONFIG["chunk_size"], CONFIG["chunk_overlap"])
                    print(f"Split into {len(text_chunks)} text chunks on page {page_num + 1}")

                    if text_chunks:
                        for i, chunk_text in enumerate(text_chunks):
                            all_chunks.append(DocumentChunk(
                                id=f"{doc_name}_page_{page_num}_chunk_{i}",
                                text=chunk_text,
                                page_number=page_num,
                                source_type="document",
                                metadata={"pdf_path": pdf_path}
                            ))
                    else:
                        print(f"‚ö†Ô∏è  No text chunks generated for page {page_num + 1}")

                # Extract images
                image_list = page.get_images()
                print(f"Found {len(image_list)} images on page {page_num + 1}")
                for img_index, img in enumerate(image_list):
                    try:
                        # Get image data
                        xref = img[0]
                        pix = fitz.Pixmap(doc, xref)

                        # Convert CMYK-like (n > 3) and grayscale (n == 1) pixmaps to RGB
                        if pix.n > 3 or pix.n == 1:
                            pix = fitz.Pixmap(fitz.csRGB, pix)

                        img_data = pix.tobytes("png")

                        # Generate description
                        print(f"Describing image {img_index} on page {page_num + 1}...")
                        description = self.describe_image_with_gemini(img_data)
                        print(f"Image description generated for page {page_num + 1}: {description[:50]}...")

                        # Create image chunk
                        all_chunks.append(DocumentChunk(
                            id=f"{doc_name}_page_{page_num}_image_{img_index}",
                            text=description,
                            page_number=page_num,
                            source_type="image",
                            image_description=description,
                            image_data=img_data,
                            metadata={"pdf_path": pdf_path}
                        ))
                        print(f"üñºÔ∏è  Processed image {img_index} on page {page_num + 1}")

                        pix = None  # drop the reference so the pixmap can be freed
                    except Exception as e:
                        print(f"Error processing image {img_index} on page {page_num + 1}: {e}")
        finally:
            # Close the document even if a page fails to load.
            doc.close()

        print(f"‚úÖ Extracted {len(all_chunks)} chunks from PDF")
        return all_chunks

    @staticmethod
    def _strip_markdown_markup(line: str) -> str:
        """Remove leading markdown markers from one line, keeping its text.

        Strips blockquote ``>``, heading ``#`` and bullet (``- ``, ``* ``,
        ``+ ``) prefixes. Lines made only of rule/underline characters
        (``-``, ``*``, ``_``, ``=``) yield an empty string.
        """
        line = line.strip()
        if not line or set(line) <= set("-*_= "):
            return ""
        while line.startswith('>'):
            line = line[1:].lstrip()
        if line.startswith('#'):
            line = line.lstrip('#').strip()
        if line[:2] in ('- ', '* ', '+ '):
            line = line[2:].strip()
        return line

    def extract_markdown_content(self, md_path: str) -> List[DocumentChunk]:
        """Extract text from markdown files and chunk it.

        Heading, list and blockquote markers are removed but their text is
        kept, so list items and headings remain searchable. This is a basic
        cleaner; a dedicated markdown parser would handle complex documents
        better.

        Args:
            md_path: Path of the UTF-8 markdown file.

        Returns:
            Chunks with ``source_type="markdown"`` and ``page_number=0``
            (markdown has no pages); an empty list if reading fails.
        """
        all_chunks = []
        try:
            with open(md_path, 'r', encoding='utf-8') as f:
                markdown_text = f.read()

            stripped_lines = (self._strip_markdown_markup(line) for line in markdown_text.splitlines())
            cleaned_text = " ".join(line for line in stripped_lines if line)

            if cleaned_text.strip():
                print(f"Extracted text from markdown {md_path}. Text length: {len(cleaned_text)}")
                text_chunks = extract_and_chunk_text(cleaned_text, CONFIG["chunk_size"], CONFIG["chunk_overlap"])
                print(f"Split into {len(text_chunks)} text chunks from markdown")

                if text_chunks:
                    for i, chunk_text in enumerate(text_chunks):
                        all_chunks.append(DocumentChunk(
                            id=f"{os.path.basename(md_path)}_chunk_{i}",
                            text=chunk_text,
                            page_number=0,
                            source_type="markdown",
                            metadata={"markdown_path": md_path}
                        ))
                else:
                    print(f"‚ö†Ô∏è No text chunks generated for markdown file {md_path}")

            print(f"‚úÖ Extracted {len(all_chunks)} chunks from markdown {md_path}")
            return all_chunks

        except Exception as e:
            print(f"‚ùå Error extracting content from markdown file {md_path}: {e}")
            return []

    def ingest_chunks(self, chunks: List[DocumentChunk]):
        """Ingest chunks into vector database and BM25 index.

        Chunks whose embedding cannot be obtained are skipped entirely. The
        in-memory chunk list and the BM25 corpus are only updated after the
        Qdrant upsert succeeded, so the three stay aligned.

        Args:
            chunks: Chunks to embed and index.
        """
        print("üîÑ Starting ingestion process...")
        print(f"Attempting to ingest {len(chunks)} chunks.")

        points = []
        accepted_chunks = []     # chunks that received an embedding, in point order
        newly_added_corpus = []  # token lists for accepted_chunks, same order

        # Point id == row index in the BM25 corpus. The corpus is persisted, so
        # ids keep increasing across sessions instead of restarting at 0.
        base_id = len(self.bm25_corpus)

        for i, chunk in enumerate(chunks):
            print(f"Processing chunk {i+1}/{len(chunks)} (ID: {chunk.id}, Type: {chunk.source_type})")
            # Get embedding
            embedding = get_gemini_embedding_with_retry(chunk.text, "RETRIEVAL_DOCUMENT")
            if embedding is None:
                print(f"‚ö†Ô∏è  Skipping chunk {chunk.id} - failed to get embedding")
                continue
            print(f"‚úÖ Embedding generated for chunk {chunk.id}")

            # Prepare point for Qdrant
            points.append(PointStruct(
                id=base_id + len(points),
                vector=embedding,
                payload={
                    "chunk_id": chunk.id,
                    "text": chunk.text,
                    "page_number": chunk.page_number,
                    "source_type": chunk.source_type,
                    "image_description": chunk.image_description,
                    "has_image": chunk.image_data is not None,
                    "metadata": chunk.metadata or {}
                }
            ))
            print(f"‚úÖ Point prepared for chunk {chunk.id}. Total points prepared: {len(points)}")

            # Prepare for BM25 using NLTK and stop words
            try:
                tokens = word_tokenize(chunk.text.lower())
                filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
                newly_added_corpus.append(filtered_tokens)
                print(f"‚úÖ Tokenized and filtered chunk {chunk.id} for BM25.")
            except Exception as e:
                print(f"‚ùå Error tokenizing/filtering chunk {chunk.id} for BM25: {e}")
                newly_added_corpus.append([])  # keep one corpus row per point

            accepted_chunks.append(chunk)

        print(f"Prepared {len(points)} points for upserting to Qdrant.")
        if points:
            # Upsert to Qdrant
            try:
                response = self.qdrant_client.upsert(
                    collection_name=CONFIG["collection_name"],
                    points=points,
                    wait=True  # Wait for the operation to complete
                )
                print(f"‚úÖ Qdrant upsert response: {response}")
                print(f"‚úÖ Upserted {len(points)} points to Qdrant.")
            except Exception as e:
                print(f"‚ùå Error during Qdrant upsert: {e}")
                # Nothing was stored, so do not record these chunks in BM25.
                print("Ingestion aborted; BM25 corpus left unchanged.")
                return
        else:
            print("‚ö†Ô∏è No points to upsert to Qdrant.")

        # Record the newly stored chunks and their corpus rows together.
        self.chunks.extend(accepted_chunks)
        self.bm25_corpus.extend(newly_added_corpus)
        print(f"BM25 corpus now holds {len(self.bm25_corpus)} documents.")

        # Build BM25 index from the accumulated corpus
        if self.bm25_corpus and any(self.bm25_corpus):
            self.bm25_index = BM25Okapi(self.bm25_corpus)
            print(f"‚úÖ BM25 index built with {len(self.bm25_corpus)} documents.")

            # Save BM25 corpus to file
            self.save_bm25_corpus()
        else:
            self.bm25_index = None  # no usable tokens at all
            self.bm25_corpus = []
            print("‚ö†Ô∏è No corpus for building BM25 index.")

        print(f"‚úÖ Successfully processed {len(points)} chunks for ingestion")

    def ingest_document(self, doc_path: str):
        """Complete ingestion pipeline for a document (PDF or Markdown).

        Args:
            doc_path: Path ending in ``.pdf``, ``.md`` or ``.markdown``
                (case-insensitive). Other extensions are reported and ignored.
        """
        lowered_path = doc_path.lower()
        if lowered_path.endswith('.pdf'):
            chunks = self.extract_pdf_content(doc_path)
        elif lowered_path.endswith(('.md', '.markdown')):
            chunks = self.extract_markdown_content(doc_path)
        else:
            print(f"‚ùå Unsupported file type: {doc_path}")
            return

        self.ingest_chunks(chunks)
        self.save_bm25_corpus()  # Save the corpus after processing each document

    def load_bm25_corpus(self, file_path: str):
        """Loads the tokenized BM25 corpus from a JSON file.

        Sets ``self.bm25_corpus`` to the loaded list, or to an empty list if
        the file is missing, unreadable, or does not contain a list.

        Args:
            file_path: Path of the JSON corpus file.
        """
        if not os.path.exists(file_path):
            print(f"‚ö†Ô∏è BM25 corpus file not found at {file_path}. BM25 index will be built during ingestion.")
            self.bm25_corpus = []
            return

        try:
            with open(file_path, "r") as f:
                loaded_corpus = json.load(f)
            if isinstance(loaded_corpus, list):
                self.bm25_corpus = loaded_corpus
                print(f"‚úÖ BM25 corpus loaded from {file_path}")
            else:
                print(f"‚ö†Ô∏è Loaded BM25 corpus from {file_path} is not a list. Initializing with empty corpus.")
                self.bm25_corpus = []
        except Exception as e:
            print(f"‚ùå Error loading BM25 corpus from {file_path}: {e}")
            self.bm25_corpus = []

    def save_bm25_corpus(self):
        """Saves the tokenized BM25 corpus to a JSON file.

        Writes ``self.bm25_corpus`` to ``self.bm25_corpus_file``; errors are
        reported but not raised.
        """
        try:
            with open(self.bm25_corpus_file, "w") as f:
                json.dump(self.bm25_corpus, f)
            print(f"‚úÖ BM25 corpus saved to {self.bm25_corpus_file}")
        except Exception as e:
            print(f"‚ùå Error saving BM25 corpus: {e}")


**Reasoning**:
Ingest the newly created markdown file using the `ingester.ingest_document()` method and observe the output.



In [None]:
class DocumentIngester:
    def __init__(self, qdrant_client: QdrantClient):
        """Set up ingestion state, the Qdrant collection and the BM25 index.

        The collection named by ``CONFIG["collection_name"]`` is created when
        looking it up fails. A corpus previously saved to
        ``self.bm25_corpus_file`` is loaded and, if non-empty, re-indexed.

        Args:
            qdrant_client: Client used for all collection operations.
        """
        self.qdrant_client = qdrant_client
        self.chunks = []
        self.bm25_corpus = []
        self.bm25_index = None
        self.bm25_corpus_file = "bm25_calcom_corpus.json"

        # An existing collection is reused as-is; it is not deleted for a fresh start.
        self._ensure_collection()

        self.load_bm25_corpus(self.bm25_corpus_file)
        self._rebuild_bm25_index()

    def _ensure_collection(self):
        """Create the configured Qdrant collection when looking it up fails."""
        try:
            self.qdrant_client.get_collection(collection_name=CONFIG["collection_name"])
            print(f"‚úÖ Collection '{CONFIG['collection_name']}' already exists.")
            return
        except Exception:
            # Any lookup failure is treated as "collection does not exist".
            pass

        try:
            vector_params = VectorParams(
                size=CONFIG["vector_dimension"],
                distance=Distance.COSINE
            )
            self.qdrant_client.create_collection(
                collection_name=CONFIG["collection_name"],
                vectors_config=vector_params
            )
            print(f"‚úÖ Created collection: {CONFIG['collection_name']}")
        except Exception as e:
            print(f"‚ùå Error creating collection: {e}")

    def _rebuild_bm25_index(self):
        """Rebuild ``self.bm25_index`` when a non-empty corpus was loaded."""
        if not self.bm25_corpus:
            return
        try:
            self.bm25_index = BM25Okapi(self.bm25_corpus)
            print("‚úÖ BM25 index rebuilt from loaded corpus.")
        except Exception as e:
            print(f"‚ùå Error rebuilding BM25 index from loaded corpus: {e}")
            self.bm25_index = None

    def describe_image_with_gemini(self, image_bytes: bytes, model_name: str = 'gemini-1.5-flash') -> str:
        """Generate a search-oriented description of an image with a Gemini model.

        ``genai`` must already be configured with an API key.

        Args:
            image_bytes: Encoded image data that PIL can open.
            model_name: Model identifier passed to ``genai.GenerativeModel``.

        Returns:
            The text of the model response, or ``"Image description unavailable"``
            when decoding the image or calling the model raises.
        """
        # Built from separate paragraphs so the prompt carries neither the source
        # indentation nor the stray leading '"' the old four-quote literal produced.
        prompt = "\n\n".join([
            "Describe the technical diagrams and tables found in the provided document. "
            "For each description, follow these steps:",
            "Purpose: Begin by stating the primary function or purpose of the diagram or table "
            "(e.g., 'This is a wiring diagram showing the electrical connections,' or "
            "'This table provides the technical specifications for the product').",
            "Components: Provide a detailed breakdown of the visual elements.",
            "For diagrams: List and explain each labeled component, symbol, or annotation. "
            "Describe the relationships or processes shown by arrows or other visual cues.",
            "For tables: Identify and explain the meaning of each column and row. "
            "Highlight the key data points, including any values and units of measurement.",
            "Synthesis: Conclude with a summary of the most important information presented, "
            "explaining how the different parts of the diagram or table work together to "
            "convey a complete message.",
        ])

        try:
            # Convert bytes to a PIL image, which generate_content accepts directly.
            image = Image.open(io.BytesIO(image_bytes))
            model = genai.GenerativeModel(model_name)
            response = model.generate_content([prompt, image])
            return response.text
        except Exception as e:
            print(f"Error describing image: {e}")
            return "Image description unavailable"

    def extract_pdf_content(self, pdf_path: str, max_pages: Optional[int] = None) -> List[DocumentChunk]:
        """Extract text chunks and described images from a PDF.

        Args:
            pdf_path: Path of the PDF, opened with ``fitz.open``.
            max_pages: Optional cap on the number of leading pages to process.
                ``None`` (the default) processes every page, as before.

        Returns:
            A list with one ``DocumentChunk`` per text chunk
            (``source_type="document"``) and one per successfully processed
            image (``source_type="image"``), in page order. ``page_number`` and
            the chunk ids use the zero-based page index.
        """
        doc = fitz.open(pdf_path)
        all_chunks = []

        try:
            total_pages = len(doc)
            if max_pages is None:
                pages_to_process = total_pages
            else:
                pages_to_process = min(total_pages, max(max_pages, 0))

            # Report what is really processed; the old message claimed a 10-page
            # limit that the loop never applied.
            print(f"üìÑ Processing {pages_to_process} of {total_pages} PDF pages...")

            for page_num in range(pages_to_process):
                page = doc.load_page(page_num)
                print(f"Processing page {page_num + 1}...")

                # Extract text
                text = page.get_text()
                if text.strip():
                    print(f"Extracted text from page {page_num + 1}. Text length: {len(text)}")
                    text_chunks = extract_and_chunk_text(text, CONFIG["chunk_size"], CONFIG["chunk_overlap"])
                    print(f"Split into {len(text_chunks)} text chunks on page {page_num + 1}")

                    if text_chunks:
                        for i, chunk_text in enumerate(text_chunks):
                            all_chunks.append(DocumentChunk(
                                id=f"page_{page_num}_chunk_{i}",
                                text=chunk_text,
                                page_number=page_num,
                                source_type="document",
                                metadata={"pdf_path": pdf_path}
                            ))
                    else:
                        print(f"‚ö†Ô∏è  No text chunks generated for page {page_num + 1}")

                # Extract images
                image_list = page.get_images()
                print(f"Found {len(image_list)} images on page {page_num + 1}")
                for img_index, img in enumerate(image_list):
                    pix = None
                    try:
                        xref = img[0]
                        pix = fitz.Pixmap(doc, xref)

                        # Convert CMYK-like (n > 3) and grayscale (n == 1) pixmaps to RGB.
                        if pix.n > 3 or pix.n == 1:
                            pix = fitz.Pixmap(fitz.csRGB, pix)

                        img_data = pix.tobytes("png")

                        print(f"Describing image {img_index} on page {page_num + 1}...")
                        description = self.describe_image_with_gemini(img_data)
                        print(f"Image description generated for page {page_num + 1}: {description[:50]}...")

                        # The description doubles as the searchable text of the image chunk.
                        all_chunks.append(DocumentChunk(
                            id=f"page_{page_num}_image_{img_index}",
                            text=description,
                            page_number=page_num,
                            source_type="image",
                            image_description=description,
                            image_data=img_data,
                            metadata={"pdf_path": pdf_path}
                        ))
                        print(f"üñºÔ∏è  Processed image {img_index} on page {page_num + 1}")
                    except Exception as e:
                        # One bad image must not abort the page; page number is 1-based like the other messages.
                        print(f"Error processing image {img_index} on page {page_num + 1}: {e}")
                    finally:
                        pix = None  # drop the pixmap reference promptly
        finally:
            # Close the document even when page processing raises.
            doc.close()

        print(f"‚úÖ Extracted {len(all_chunks)} chunks from PDF")
        return all_chunks

    def extract_markdown_content(self, md_path: str) -> List[DocumentChunk]:
        """Read a Markdown file, strip block-level markup and chunk the text.

        Header (``#``), list (``- ``/``* ``) and blockquote (``> ``) markers are
        removed but the text after them is kept, so headings and bullet points
        remain searchable. Blank lines and horizontal rules are dropped. The
        remaining lines are joined with spaces and chunked.

        Args:
            md_path: Path of the UTF-8 encoded Markdown file.

        Returns:
            A list of ``DocumentChunk`` objects with ``source_type="markdown"``
            and ``page_number=0``; an empty list when anything raises.
        """
        def strip_block_markup(raw_line: str) -> str:
            """Return one line's text without its leading Markdown block marker."""
            line = raw_line.strip()
            if not line:
                return ""
            # Horizontal rule: three or more identical '-', '*' or '_' (spaces allowed).
            compact = line.replace(" ", "")
            if len(compact) >= 3 and compact[0] in "-*_" and compact == compact[0] * len(compact):
                return ""
            if line.startswith('#'):
                return line.lstrip('#').strip()
            if line.startswith(('- ', '* ', '> ')):
                return line[2:].strip()
            return line

        all_chunks = []
        try:
            with open(md_path, 'r', encoding='utf-8') as f:
                markdown_text = f.read()

            # Basic cleaning only; a dedicated Markdown parser would handle complex documents better.
            kept_lines = [text for text in map(strip_block_markup, markdown_text.splitlines()) if text]
            cleaned_text = " ".join(kept_lines)

            if cleaned_text.strip():
                print(f"Extracted text from markdown {md_path}. Text length: {len(cleaned_text)}")
                text_chunks = extract_and_chunk_text(cleaned_text, CONFIG["chunk_size"], CONFIG["chunk_overlap"])
                print(f"Split into {len(text_chunks)} text chunks from markdown")

                if text_chunks:
                    for i, chunk_text in enumerate(text_chunks):
                        # Markdown has no pages, so every chunk is filed under page 0.
                        all_chunks.append(DocumentChunk(
                            id=f"{os.path.basename(md_path)}_chunk_{i}",
                            text=chunk_text,
                            page_number=0,
                            source_type="markdown",
                            metadata={"markdown_path": md_path}
                        ))
                else:
                    print(f"‚ö†Ô∏è No text chunks generated for markdown file {md_path}")

            print(f"‚úÖ Extracted {len(all_chunks)} chunks from markdown {md_path}")
            return all_chunks

        except Exception as e:
            print(f"‚ùå Error extracting content from markdown file {md_path}: {e}")
            return []


    def ingest_chunks(self, chunks: List[DocumentChunk]):
        """Embed chunks, upsert them into Qdrant and extend the BM25 index.

        A chunk whose embedding cannot be obtained is skipped entirely. Every
        other chunk is appended to ``self.chunks`` exactly once, turned into a
        Qdrant point and tokenized for BM25. The points are then upserted in a
        single call, and the BM25 index is rebuilt from the whole accumulated
        corpus (and saved) when that corpus contains at least one token.

        Args:
            chunks: Chunks to ingest.

        Note:
            A point id is the chunk's position in ``self.chunks``.
            NOTE(review): ``self.chunks`` starts empty for each new ingester, so
            with a persistent collection ids restart at 0 - confirm whether
            overwriting earlier points is acceptable.
        """
        print("üîÑ Starting ingestion process...")
        print(f"Attempting to ingest {len(chunks)} chunks.")

        points = []
        newly_added_corpus = []  # token lists produced by this call only

        for i, chunk in enumerate(chunks):
            print(f"Processing chunk {i+1}/{len(chunks)} (ID: {chunk.id}, Type: {chunk.source_type})")
            embedding = get_gemini_embedding_with_retry(chunk.text, "RETRIEVAL_DOCUMENT")
            if embedding is None:
                print(f"‚ö†Ô∏è  Skipping chunk {chunk.id} - failed to get embedding")
                continue
            print(f"‚úÖ Embedding generated for chunk {chunk.id}")

            # The id is taken before the chunk is stored, so it equals the chunk's
            # index in self.chunks and stays unique across ingestion calls.
            point = PointStruct(
                id=len(self.chunks),
                vector=embedding,
                payload={
                    "chunk_id": chunk.id,
                    "text": chunk.text,
                    "page_number": chunk.page_number,
                    "source_type": chunk.source_type,
                    "image_description": chunk.image_description,
                    "has_image": chunk.image_data is not None,
                    "metadata": chunk.metadata or {}
                }
            )
            points.append(point)
            print(f"‚úÖ Point prepared for chunk {chunk.id}. Total points prepared: {len(points)}")

            # Prepare for BM25 using NLTK and stop words
            try:
                tokens = word_tokenize(chunk.text.lower())
                filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
                newly_added_corpus.append(filtered_tokens)
                print(f"‚úÖ Tokenized and filtered chunk {chunk.id} for BM25.")
            except Exception as e:
                print(f"‚ùå Error tokenizing/filtering chunk {chunk.id} for BM25: {e}")
                newly_added_corpus.append([])  # keep one corpus entry per ingested chunk

            # Store only this chunk. Extending with the whole batch here re-added
            # every chunk on each iteration, including skipped ones.
            self.chunks.append(chunk)

        print(f"Prepared {len(points)} points for upserting to Qdrant.")
        if points:
            try:
                response = self.qdrant_client.upsert(
                    collection_name=CONFIG["collection_name"],
                    points=points,
                    wait=True  # block until the operation has completed
                )
                print(f"‚úÖ Qdrant upsert response: {response}")
                print(f"‚úÖ Upserted {len(points)} points to Qdrant.")
            except Exception as e:
                print(f"‚ùå Error during Qdrant upsert: {e}")
        else:
            print("‚ö†Ô∏è No points to upsert to Qdrant.")

        # Accumulate the new corpus
        self.bm25_corpus.extend(newly_added_corpus)
        print(f"BM25 corpus now holds {len(self.bm25_corpus)} documents ({len(newly_added_corpus)} new).")

        # Rebuild only when at least one document has tokens.
        if self.bm25_corpus and any(self.bm25_corpus):
            self.bm25_index = BM25Okapi(self.bm25_corpus)
            print(f"‚úÖ BM25 index built with {len(self.bm25_corpus)} documents.")
            self.save_bm25_corpus()
        else:
            self.bm25_index = None
            self.bm25_corpus = []
            print("‚ö†Ô∏è No corpus for building BM25 index.")

        print(f"‚úÖ Successfully processed {len(points)} chunks for ingestion")


    def ingest_document(self, doc_path: str):
        """Run extraction and ingestion for a single PDF or Markdown file.

        The extractor is selected from the file extension (case-insensitive).
        The extracted chunks are passed to ``ingest_chunks`` and the BM25
        corpus is saved afterwards. Any other extension is reported and the
        method returns without ingesting anything.

        Args:
            doc_path: Path of the document to ingest.
        """
        normalized_path = doc_path.lower()
        if normalized_path.endswith('.pdf'):
            extract = self.extract_pdf_content
        elif normalized_path.endswith('.md'):
            extract = self.extract_markdown_content
        else:
            print(f"‚ùå Unsupported file type: {doc_path}")
            return

        extracted_chunks = extract(doc_path)
        self.ingest_chunks(extracted_chunks)
        # Persist the corpus once the document has been processed.
        self.save_bm25_corpus()


    def load_bm25_corpus(self, file_path: str):
        """Load the tokenized BM25 corpus from a JSON file into ``self.bm25_corpus``.

        ``self.bm25_corpus`` ends up as an empty list when the file is missing,
        cannot be read or parsed, or does not contain a JSON list. Problems are
        reported with ``print`` instead of being raised, so the caller can
        carry on with an empty corpus.

        Args:
            file_path: Path of the JSON corpus file.
        """
        # Start from a known-empty state; only a validated load replaces it.
        self.bm25_corpus = []

        if not os.path.exists(file_path):
            print(f"‚ö†Ô∏è BM25 corpus file not found at {file_path}. BM25 index will be built during ingestion.")
            return

        try:
            with open(file_path, "r") as f:
                loaded_corpus = json.load(f)
        except Exception as e:
            print(f"‚ùå Error loading BM25 corpus from {file_path}: {e}")
            return

        if not isinstance(loaded_corpus, list):
            print(f"‚ö†Ô∏è Loaded BM25 corpus from {file_path} is not a list. Initializing with empty corpus.")
            return

        self.bm25_corpus = loaded_corpus
        # Success is reported only when a list was really loaded; it used to be
        # printed directly after the "not a list" warning as well.
        print(f"‚úÖ BM25 corpus loaded from {file_path}")


    def save_bm25_corpus(self):
        """Write ``self.bm25_corpus`` as JSON to ``self.bm25_corpus_file``.

        A failure is reported with ``print`` rather than raised.
        """
        try:
            with open(self.bm25_corpus_file, "w") as corpus_file:
                json.dump(self.bm25_corpus, corpus_file)
        except Exception as e:
            print(f"‚ùå Error saving BM25 corpus: {e}")
        else:
            print(f"‚úÖ BM25 corpus saved to {self.bm25_corpus_file}")


**Reasoning**:
Now that the `ingest_document` method correctly handles markdown files, extend it to accept either a single path or a list of paths, then ingest the sample markdown file again to verify that the ingestion process works as expected.



In [None]:
class DocumentIngester:
    def __init__(self, qdrant_client: QdrantClient):
        """Set up ingestion state, the Qdrant collection and the BM25 index.

        The collection named by ``CONFIG["collection_name"]`` is created when
        looking it up fails. A corpus previously saved to
        ``self.bm25_corpus_file`` is loaded and, if non-empty, re-indexed.

        Args:
            qdrant_client: Client used for all collection operations.
        """
        self.qdrant_client = qdrant_client
        self.chunks = []
        self.bm25_corpus = []
        self.bm25_index = None
        self.bm25_corpus_file = "bm25_calcom_corpus.json"

        # An existing collection is reused as-is; it is not deleted for a fresh start.
        self._ensure_collection()

        self.load_bm25_corpus(self.bm25_corpus_file)
        self._rebuild_bm25_index()

    def _ensure_collection(self):
        """Create the configured Qdrant collection when looking it up fails."""
        try:
            self.qdrant_client.get_collection(collection_name=CONFIG["collection_name"])
            print(f"‚úÖ Collection '{CONFIG['collection_name']}' already exists.")
            return
        except Exception:
            # Any lookup failure is treated as "collection does not exist".
            pass

        try:
            vector_params = VectorParams(
                size=CONFIG["vector_dimension"],
                distance=Distance.COSINE
            )
            self.qdrant_client.create_collection(
                collection_name=CONFIG["collection_name"],
                vectors_config=vector_params
            )
            print(f"‚úÖ Created collection: {CONFIG['collection_name']}")
        except Exception as e:
            print(f"‚ùå Error creating collection: {e}")

    def _rebuild_bm25_index(self):
        """Rebuild ``self.bm25_index`` when a non-empty corpus was loaded."""
        if not self.bm25_corpus:
            return
        try:
            self.bm25_index = BM25Okapi(self.bm25_corpus)
            print("‚úÖ BM25 index rebuilt from loaded corpus.")
        except Exception as e:
            print(f"‚ùå Error rebuilding BM25 index from loaded corpus: {e}")
            self.bm25_index = None

    def describe_image_with_gemini(self, image_bytes: bytes, model_name: str = 'gemini-1.5-flash') -> str:
        """Generate a search-oriented description of an image with a Gemini model.

        ``genai`` must already be configured with an API key.

        Args:
            image_bytes: Encoded image data that PIL can open.
            model_name: Model identifier passed to ``genai.GenerativeModel``.

        Returns:
            The text of the model response, or ``"Image description unavailable"``
            when decoding the image or calling the model raises.
        """
        # Built from separate paragraphs so the prompt carries neither the source
        # indentation nor the stray leading '"' the old four-quote literal produced.
        prompt = "\n\n".join([
            "Describe the technical diagrams and tables found in the provided document. "
            "For each description, follow these steps:",
            "Purpose: Begin by stating the primary function or purpose of the diagram or table "
            "(e.g., 'This is a wiring diagram showing the electrical connections,' or "
            "'This table provides the technical specifications for the product').",
            "Components: Provide a detailed breakdown of the visual elements.",
            "For diagrams: List and explain each labeled component, symbol, or annotation. "
            "Describe the relationships or processes shown by arrows or other visual cues.",
            "For tables: Identify and explain the meaning of each column and row. "
            "Highlight the key data points, including any values and units of measurement.",
            "Synthesis: Conclude with a summary of the most important information presented, "
            "explaining how the different parts of the diagram or table work together to "
            "convey a complete message.",
        ])

        try:
            # Convert bytes to a PIL image, which generate_content accepts directly.
            image = Image.open(io.BytesIO(image_bytes))
            model = genai.GenerativeModel(model_name)
            response = model.generate_content([prompt, image])
            return response.text
        except Exception as e:
            print(f"Error describing image: {e}")
            return "Image description unavailable"

    def extract_pdf_content(self, pdf_path: str, max_pages: Optional[int] = None) -> List[DocumentChunk]:
        """Extract text chunks and described images from a PDF.

        Args:
            pdf_path: Path of the PDF, opened with ``fitz.open``.
            max_pages: Optional cap on the number of leading pages to process.
                ``None`` (the default) processes every page, as before.

        Returns:
            A list with one ``DocumentChunk`` per text chunk
            (``source_type="document"``) and one per successfully processed
            image (``source_type="image"``), in page order. ``page_number`` and
            the chunk ids use the zero-based page index.
        """
        doc = fitz.open(pdf_path)
        all_chunks = []

        try:
            total_pages = len(doc)
            if max_pages is None:
                pages_to_process = total_pages
            else:
                pages_to_process = min(total_pages, max(max_pages, 0))

            # Report what is really processed; the old message claimed a 10-page
            # limit that the loop never applied.
            print(f"üìÑ Processing {pages_to_process} of {total_pages} PDF pages...")

            for page_num in range(pages_to_process):
                page = doc.load_page(page_num)
                print(f"Processing page {page_num + 1}...")

                # Extract text
                text = page.get_text()
                if text.strip():
                    print(f"Extracted text from page {page_num + 1}. Text length: {len(text)}")
                    text_chunks = extract_and_chunk_text(text, CONFIG["chunk_size"], CONFIG["chunk_overlap"])
                    print(f"Split into {len(text_chunks)} text chunks on page {page_num + 1}")

                    if text_chunks:
                        for i, chunk_text in enumerate(text_chunks):
                            all_chunks.append(DocumentChunk(
                                id=f"page_{page_num}_chunk_{i}",
                                text=chunk_text,
                                page_number=page_num,
                                source_type="document",
                                metadata={"pdf_path": pdf_path}
                            ))
                    else:
                        print(f"‚ö†Ô∏è  No text chunks generated for page {page_num + 1}")

                # Extract images
                image_list = page.get_images()
                print(f"Found {len(image_list)} images on page {page_num + 1}")
                for img_index, img in enumerate(image_list):
                    pix = None
                    try:
                        xref = img[0]
                        pix = fitz.Pixmap(doc, xref)

                        # Convert CMYK-like (n > 3) and grayscale (n == 1) pixmaps to RGB.
                        if pix.n > 3 or pix.n == 1:
                            pix = fitz.Pixmap(fitz.csRGB, pix)

                        img_data = pix.tobytes("png")

                        print(f"Describing image {img_index} on page {page_num + 1}...")
                        description = self.describe_image_with_gemini(img_data)
                        print(f"Image description generated for page {page_num + 1}: {description[:50]}...")

                        # The description doubles as the searchable text of the image chunk.
                        all_chunks.append(DocumentChunk(
                            id=f"page_{page_num}_image_{img_index}",
                            text=description,
                            page_number=page_num,
                            source_type="image",
                            image_description=description,
                            image_data=img_data,
                            metadata={"pdf_path": pdf_path}
                        ))
                        print(f"üñºÔ∏è  Processed image {img_index} on page {page_num + 1}")
                    except Exception as e:
                        # One bad image must not abort the page; page number is 1-based like the other messages.
                        print(f"Error processing image {img_index} on page {page_num + 1}: {e}")
                    finally:
                        pix = None  # drop the pixmap reference promptly
        finally:
            # Close the document even when page processing raises.
            doc.close()

        print(f"‚úÖ Extracted {len(all_chunks)} chunks from PDF")
        return all_chunks

    def extract_markdown_content(self, md_path: str) -> List[DocumentChunk]:
        """Read a Markdown file, strip block-level markup and chunk the text.

        Header (``#``), list (``- ``/``* ``) and blockquote (``> ``) markers are
        removed but the text after them is kept, so headings and bullet points
        remain searchable. Blank lines and horizontal rules are dropped. The
        remaining lines are joined with spaces and chunked.

        Args:
            md_path: Path of the UTF-8 encoded Markdown file.

        Returns:
            A list of ``DocumentChunk`` objects with ``source_type="markdown"``
            and ``page_number=0``; an empty list when anything raises.
        """
        def strip_block_markup(raw_line: str) -> str:
            """Return one line's text without its leading Markdown block marker."""
            line = raw_line.strip()
            if not line:
                return ""
            # Horizontal rule: three or more identical '-', '*' or '_' (spaces allowed).
            compact = line.replace(" ", "")
            if len(compact) >= 3 and compact[0] in "-*_" and compact == compact[0] * len(compact):
                return ""
            if line.startswith('#'):
                return line.lstrip('#').strip()
            if line.startswith(('- ', '* ', '> ')):
                return line[2:].strip()
            return line

        all_chunks = []
        try:
            with open(md_path, 'r', encoding='utf-8') as f:
                markdown_text = f.read()

            # Basic cleaning only; a dedicated Markdown parser would handle complex documents better.
            kept_lines = [text for text in map(strip_block_markup, markdown_text.splitlines()) if text]
            cleaned_text = " ".join(kept_lines)

            if cleaned_text.strip():
                print(f"Extracted text from markdown {md_path}. Text length: {len(cleaned_text)}")
                text_chunks = extract_and_chunk_text(cleaned_text, CONFIG["chunk_size"], CONFIG["chunk_overlap"])
                print(f"Split into {len(text_chunks)} text chunks from markdown")

                if text_chunks:
                    for i, chunk_text in enumerate(text_chunks):
                        # Markdown has no pages, so every chunk is filed under page 0.
                        all_chunks.append(DocumentChunk(
                            id=f"{os.path.basename(md_path)}_chunk_{i}",
                            text=chunk_text,
                            page_number=0,
                            source_type="markdown",
                            metadata={"markdown_path": md_path}
                        ))
                else:
                    print(f"‚ö†Ô∏è No text chunks generated for markdown file {md_path}")

            print(f"‚úÖ Extracted {len(all_chunks)} chunks from markdown {md_path}")
            return all_chunks

        except Exception as e:
            print(f"‚ùå Error extracting content from markdown file {md_path}: {e}")
            return []


    def ingest_chunks(self, chunks: List[DocumentChunk]):
        """Embed chunks, upsert them into Qdrant and extend the BM25 index.

        A chunk whose embedding cannot be obtained is skipped entirely. Every
        other chunk is appended to ``self.chunks`` exactly once, turned into a
        Qdrant point and tokenized for BM25. The points are then upserted in a
        single call, and the BM25 index is rebuilt from the whole accumulated
        corpus (and saved) when that corpus contains at least one token.

        Args:
            chunks: Chunks to ingest.

        Note:
            A point id is the chunk's position in ``self.chunks``.
            NOTE(review): ``self.chunks`` starts empty for each new ingester, so
            with a persistent collection ids restart at 0 - confirm whether
            overwriting earlier points is acceptable.
        """
        print("üîÑ Starting ingestion process...")
        print(f"Attempting to ingest {len(chunks)} chunks.")

        points = []
        newly_added_corpus = []  # token lists produced by this call only

        for i, chunk in enumerate(chunks):
            print(f"Processing chunk {i+1}/{len(chunks)} (ID: {chunk.id}, Type: {chunk.source_type})")
            embedding = get_gemini_embedding_with_retry(chunk.text, "RETRIEVAL_DOCUMENT")
            if embedding is None:
                print(f"‚ö†Ô∏è  Skipping chunk {chunk.id} - failed to get embedding")
                continue
            print(f"‚úÖ Embedding generated for chunk {chunk.id}")

            # The id is taken before the chunk is stored, so it equals the chunk's
            # index in self.chunks and stays unique across ingestion calls.
            point = PointStruct(
                id=len(self.chunks),
                vector=embedding,
                payload={
                    "chunk_id": chunk.id,
                    "text": chunk.text,
                    "page_number": chunk.page_number,
                    "source_type": chunk.source_type,
                    "image_description": chunk.image_description,
                    "has_image": chunk.image_data is not None,
                    "metadata": chunk.metadata or {}
                }
            )
            points.append(point)
            print(f"‚úÖ Point prepared for chunk {chunk.id}. Total points prepared: {len(points)}")

            # Prepare for BM25 using NLTK and stop words
            try:
                tokens = word_tokenize(chunk.text.lower())
                filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
                newly_added_corpus.append(filtered_tokens)
                print(f"‚úÖ Tokenized and filtered chunk {chunk.id} for BM25.")
            except Exception as e:
                print(f"‚ùå Error tokenizing/filtering chunk {chunk.id} for BM25: {e}")
                newly_added_corpus.append([])  # keep one corpus entry per ingested chunk

            # Store only this chunk. Extending with the whole batch here re-added
            # every chunk on each iteration, including skipped ones.
            self.chunks.append(chunk)

        print(f"Prepared {len(points)} points for upserting to Qdrant.")
        if points:
            try:
                response = self.qdrant_client.upsert(
                    collection_name=CONFIG["collection_name"],
                    points=points,
                    wait=True  # block until the operation has completed
                )
                print(f"‚úÖ Qdrant upsert response: {response}")
                print(f"‚úÖ Upserted {len(points)} points to Qdrant.")
            except Exception as e:
                print(f"‚ùå Error during Qdrant upsert: {e}")
        else:
            print("‚ö†Ô∏è No points to upsert to Qdrant.")

        # Accumulate the new corpus
        self.bm25_corpus.extend(newly_added_corpus)
        print(f"BM25 corpus now holds {len(self.bm25_corpus)} documents ({len(newly_added_corpus)} new).")

        # Rebuild only when at least one document has tokens.
        if self.bm25_corpus and any(self.bm25_corpus):
            self.bm25_index = BM25Okapi(self.bm25_corpus)
            print(f"‚úÖ BM25 index built with {len(self.bm25_corpus)} documents.")
            self.save_bm25_corpus()
        else:
            self.bm25_index = None
            self.bm25_corpus = []
            print("‚ö†Ô∏è No corpus for building BM25 index.")

        print(f"‚úÖ Successfully processed {len(points)} chunks for ingestion")


    def ingest_document(self, doc_paths: str | List[str]):
        """Complete ingestion pipeline for a document or list of documents (PDF or Markdown)."""
        if isinstance(doc_paths, str):
            doc_paths = [doc_paths]
            print(f"Processing single document: {doc_paths[0]}")
        elif isinstance(doc_paths, list):
            print(f"Processing {len(doc_paths)} documents from a list.")
        else:
            print(f"‚ùå Unsupported input type for ingestion: {type(doc_paths)}")
            return

        all_chunks = []
        for doc_path in doc_paths:
            try:
                if doc_path.lower().endswith('.pdf'):
                    print(f"Extracting content from PDF: {doc_path}")
                    chunks = self.extract_pdf_content(doc_path)
                elif doc_path.lower().endswith('.md'):
                    print(f"Extracting content from Markdown: {doc_path}")
                    chunks = self.extract_markdown_content(doc_path)
                else:
                    print(f"‚ùå Skipping unsupported file type: {doc_path}")
                    continue
                all_chunks.extend(chunks)
            except Exception as e:
                print(f"‚ùå Error processing document {doc_path}: {e}")

        if all_chunks:
            self.ingest_chunks(all_chunks)
            self.save_bm25_corpus() # Save the corpus after processing each ingestion batch
        else:
            print("‚ö†Ô∏è No chunks extracted from the provided document(s).")


    def load_bm25_corpus(self, file_path: str):
        """Load the tokenized BM25 corpus from a JSON file into ``self.bm25_corpus``.

        ``self.bm25_corpus`` ends up as an empty list when the file is missing,
        cannot be read or parsed, or does not contain a JSON list. Problems are
        reported with ``print`` instead of being raised, so the caller can
        carry on with an empty corpus.

        Args:
            file_path: Path of the JSON corpus file.
        """
        # Start from a known-empty state; only a validated load replaces it.
        self.bm25_corpus = []

        if not os.path.exists(file_path):
            print(f"‚ö†Ô∏è BM25 corpus file not found at {file_path}. BM25 index will be built during ingestion.")
            return

        try:
            with open(file_path, "r") as f:
                loaded_corpus = json.load(f)
        except Exception as e:
            print(f"‚ùå Error loading BM25 corpus from {file_path}: {e}")
            return

        if not isinstance(loaded_corpus, list):
            print(f"‚ö†Ô∏è Loaded BM25 corpus from {file_path} is not a list. Initializing with empty corpus.")
            return

        self.bm25_corpus = loaded_corpus
        # Success is reported only when a list was really loaded; it used to be
        # printed directly after the "not a list" warning as well.
        print(f"‚úÖ BM25 corpus loaded from {file_path}")


    def save_bm25_corpus(self):
        """Write ``self.bm25_corpus`` as JSON to ``self.bm25_corpus_file``.

        A failure is reported with ``print`` rather than raised.
        """
        try:
            with open(self.bm25_corpus_file, "w") as corpus_file:
                json.dump(self.bm25_corpus, corpus_file)
        except Exception as e:
            print(f"‚ùå Error saving BM25 corpus: {e}")
        else:
            print(f"‚úÖ BM25 corpus saved to {self.bm25_corpus_file}")


**Reasoning**:
Ingest the sample markdown file using the updated `ingest_document` method to test if it processes the markdown file correctly.



In [None]:
qdrant_client = create_qdrant_client()
ingester = DocumentIngester(qdrant_client)

# `ingest_document` dispatches on the file extension, so it needs file paths:
# the bare directory "/content/" is rejected as an unsupported file type.
# NOTE(review): the intended file name is not given here, so every top-level
# `.md` file in the directory is ingested - narrow this to the sample file if needed.
markdown_file_path = "/content/"
markdown_files = sorted(
    os.path.join(markdown_file_path, file_name)
    for file_name in os.listdir(markdown_file_path)
    if file_name.lower().endswith(".md")
)

if markdown_files:
    ingester.ingest_document(markdown_files)
else:
    print(f"No markdown files found in {markdown_file_path}")

## Refine chunking

### Subtask:
Review and potentially adjust the `extract_and_chunk_text` function or create a new one to ensure it effectively chunks markdown content while preserving relevant structure and associating images.


**Reasoning**:
Examine the current `extract_and_chunk_text` function and consider its suitability for the already cleaned markdown text. Determine if it's sufficient or if minor adjustments are needed for the current level of markdown handling. Based on this analysis, conclude whether the current approach is adequate.



## Enhance markdown image handling

### Subtask:
Implement a method within `DocumentIngester` to identify and process images referenced within markdown files. This might involve extracting image paths or data and generating descriptions similar to PDF image handling.


**Reasoning**:
Implement the `_extract_markdown_images` method in the `DocumentIngester` class to extract image references from markdown text, process the images by generating descriptions, and modify `extract_markdown_content` to integrate this image processing.



In [None]:
import re
import os
from PIL import Image
import io
import fitz
import json
import uuid # Import the uuid library

class DocumentIngester:
    """Turn PDF and Markdown files into chunks and index them for hybrid search.

    Every chunk is embedded and upserted into the Qdrant collection named by
    ``CONFIG["collection_name"]``. Its tokenized text is also appended to a
    BM25 corpus, which is persisted as JSON in ``self.bm25_corpus_file``.

    NOTE(review): ``self.chunks`` lives only in memory while the BM25 corpus is
    (re)loaded from disk, so after a kernel restart the two lists may no longer
    line up index-for-index — confirm how the search engine maps BM25 hits
    back to chunk objects.
    """

    def __init__(self, qdrant_client: QdrantClient):
        """Ensure the Qdrant collection exists and restore any saved BM25 corpus.

        Args:
            qdrant_client: Client used for all collection and upsert calls.
        """
        self.qdrant_client = qdrant_client
        self.chunks = []
        self.bm25_corpus = []
        self.bm25_index = None
        self.bm25_corpus_file = "bm25_calcom_corpus.json"

        # The collection is intentionally NOT deleted here, so repeated
        # instantiation keeps previously ingested points.

        # Create collection if it doesn't exist
        try:
            self.qdrant_client.get_collection(collection_name=CONFIG["collection_name"])
            print(f"‚úÖ Collection '{CONFIG['collection_name']}' already exists.")
        except Exception:  # get_collection failed: treat as "collection does not exist"
            try:
                self.qdrant_client.create_collection(
                    collection_name=CONFIG["collection_name"],
                    vectors_config=VectorParams(
                        size=CONFIG["vector_dimension"],
                        distance=Distance.COSINE
                    )
                )
                print(f"‚úÖ Created collection: {CONFIG['collection_name']}")
            except Exception as e:
                print(f"‚ùå Error creating collection: {e}")

        # Attempt to load BM25 corpus
        self.load_bm25_corpus(self.bm25_corpus_file)

        # Rebuild BM25 index if corpus was loaded
        if self.bm25_corpus:
            try:
                self.bm25_index = BM25Okapi(self.bm25_corpus)
                print("‚úÖ BM25 index rebuilt from loaded corpus.")
            except Exception as e:
                print(f"‚ùå Error rebuilding BM25 index from loaded corpus: {e}")
                self.bm25_index = None

    def describe_image_with_gemini(self, image_bytes: bytes) -> str:
        """Generate an image description using Gemini Vision.

        Args:
            image_bytes: Encoded image data readable by ``PIL.Image.open``.

        Returns:
            The model's text response, or the fixed string
            ``"Image description unavailable"`` if anything fails.
        """
        try:
            # Convert bytes to PIL Image
            image = Image.open(io.BytesIO(image_bytes))

            # genai must already be configured with an API key before this call.
            model = genai.GenerativeModel('gemini-1.5-flash')

            # NOTE(review): the prompt text starts with a stray double-quote
            # character (four quotes open the literal); left untouched so the
            # text sent to the model is unchanged.
            prompt = """"Describe the technical diagrams and tables found in the provided document. For each description, follow these steps:

            Purpose: Begin by stating the primary function or purpose of the diagram or table (e.g., 'This is a wiring diagram showing the electrical connections,' or 'This table provides the technical specifications for the product').

            Components: Provide a detailed breakdown of the visual elements.

            For diagrams: List and explain each labeled component, symbol, or annotation. Describe the relationships or processes shown by arrows or other visual cues.

            For tables: Identify and explain the meaning of each column and row. Highlight the key data points, including any values and units of measurement.

            Synthesis: Conclude with a summary of the most important information presented, explaining how the different parts of the diagram or table work together to convey a complete message."""

            response = model.generate_content([prompt, image])
            return response.text
        except Exception as e:
            print(f"Error describing image: {e}")
            return "Image description unavailable"

    def extract_pdf_content(self, pdf_path: str, max_pages: Optional[int] = None) -> List[DocumentChunk]:
        """Extract text and image chunks from a PDF.

        Args:
            pdf_path: Path of the PDF to open with PyMuPDF.
            max_pages: Optional cap on the number of leading pages to process.
                ``None`` (the default) processes every page, which is what
                this method has always done despite its old "first 10 pages"
                message.

        Returns:
            Text chunks (``source_type="document"``) and image chunks
            (``source_type="image"``) in page order. ``page_number`` is 0-based.
        """
        doc = fitz.open(pdf_path)
        all_chunks = []

        try:
            total_pages = len(doc)
            if max_pages is None:
                pages_to_process = total_pages
            else:
                pages_to_process = max(0, min(total_pages, max_pages))

            print(f"üìÑ Processing {pages_to_process} of {total_pages} PDF pages...")

            for page_num in range(pages_to_process):
                page = doc.load_page(page_num)
                print(f"Processing page {page_num + 1}...")

                # Extract text
                text = page.get_text()
                if text.strip():
                    print(f"Extracted text from page {page_num + 1}. Text length: {len(text)}")
                    # Chunk the text
                    text_chunks = extract_and_chunk_text(text, CONFIG["chunk_size"], CONFIG["chunk_overlap"])
                    print(f"Split into {len(text_chunks)} text chunks on page {page_num + 1}")

                    if text_chunks:
                        for i, chunk_text in enumerate(text_chunks):
                            all_chunks.append(DocumentChunk(
                                id=f"page_{page_num}_chunk_{i}",
                                text=chunk_text,
                                page_number=page_num,
                                source_type="document",
                                metadata={"pdf_path": pdf_path}
                            ))
                    else:
                        print(f"‚ö†Ô∏è  No text chunks generated for page {page_num + 1}")

                # Extract images
                image_list = page.get_images()
                print(f"Found {len(image_list)} images on page {page_num + 1}")
                for img_index, img in enumerate(image_list):
                    try:
                        # First tuple entry is the image xref.
                        xref = img[0]
                        pix = fitz.Pixmap(doc, xref)

                        # Re-render CMYK-like (n > 3) and grayscale (n == 1)
                        # pixmaps in RGB before PNG encoding.
                        if pix.n > 3 or pix.n == 1:
                            pix = fitz.Pixmap(fitz.csRGB, pix)

                        img_data = pix.tobytes("png")

                        # Generate description
                        print(f"Describing image {img_index} on page {page_num + 1}...")
                        description = self.describe_image_with_gemini(img_data)
                        print(f"Image description generated for page {page_num + 1}: {description[:50]}...")

                        # The description doubles as the searchable text of the image chunk.
                        all_chunks.append(DocumentChunk(
                            id=f"page_{page_num}_image_{img_index}",
                            text=description,
                            page_number=page_num,
                            source_type="image",
                            image_description=description,
                            image_data=img_data,
                            metadata={"pdf_path": pdf_path}
                        ))
                        print(f"üñºÔ∏è  Processed image {img_index} on page {page_num + 1}")

                        pix = None  # drop the pixmap reference before the next image
                    except Exception as e:
                        # Best effort: one bad image must not abort the page.
                        print(f"Error processing image {img_index} on page {page_num + 1}: {e}")
        finally:
            # Always release the file handle, even if a page fails.
            doc.close()

        print(f"‚úÖ Extracted {len(all_chunks)} chunks from PDF")
        return all_chunks

    def _extract_markdown_images(self, markdown_text: str, md_dir: str) -> List[Tuple[str, bytes, str]]:
        """Load and describe the local images referenced in markdown text.

        Args:
            markdown_text: Raw markdown source.
            md_dir: Directory used to resolve relative image paths.

        Returns:
            One ``(resolved_path, image_bytes, description)`` tuple per
            reference whose file could be read. References whose file is
            missing or unreadable are reported on stdout and skipped.
        """
        images_info = []
        # Markdown image reference: ![alt text](target)
        image_pattern = re.compile(r'!\[(.*?)\]\((.*?)\)')
        # Optional trailing title inside the parentheses: path "Title" / path 'Title'
        title_pattern = re.compile(r'''^(.*?)\s+(?:"[^"]*"|'[^']*')$''')

        for match in image_pattern.finditer(markdown_text):
            image_path = match.group(2).strip()

            # Drop a quoted title so it is not treated as part of the file name.
            title_match = title_pattern.match(image_path)
            if title_match:
                image_path = title_match.group(1)
            # Unwrap the <path with spaces> destination form.
            if image_path.startswith('<') and image_path.endswith('>'):
                image_path = image_path[1:-1]

            # Resolve relative paths against the markdown file's directory.
            if os.path.isabs(image_path):
                abs_image_path = image_path
            else:
                abs_image_path = os.path.join(md_dir, image_path)

            print(f"Found image reference: '{image_path}', attempting to load from '{abs_image_path}'")

            # isfile (not exists) so an empty target resolving to a directory is rejected.
            if os.path.isfile(abs_image_path):
                try:
                    # Read image data
                    with open(abs_image_path, 'rb') as img_file:
                        img_data = img_file.read()

                    # Generate description
                    description = self.describe_image_with_gemini(img_data)
                    print(f"Generated description for '{image_path}': {description[:50]}...")

                    images_info.append((abs_image_path, img_data, description))
                    print(f"‚úÖ Processed image: {abs_image_path}")

                except Exception as e:
                    print(f"‚ùå Error processing image file '{abs_image_path}': {e}")
            else:
                print(f"‚ö†Ô∏è Image file not found: {abs_image_path}")

        return images_info

    def extract_markdown_content(self, md_path: str) -> List[DocumentChunk]:
        """Extract image chunks and text chunks from a markdown file.

        Args:
            md_path: Path of the markdown file (read as UTF-8).

        Returns:
            Image chunks first, then text chunks, all with ``page_number=0``
            because markdown has no pages. Returns ``[]`` if anything raises.
        """
        all_chunks = []
        md_dir = os.path.dirname(md_path)  # base directory for relative image paths
        try:
            with open(md_path, 'r', encoding='utf-8') as f:
                markdown_text = f.read()

            # Extract images and generate descriptions first
            images_info = self._extract_markdown_images(markdown_text, md_dir)

            # Create image chunks
            for i, (img_path, img_data, description) in enumerate(images_info):
                all_chunks.append(DocumentChunk(
                    id=f"{os.path.basename(md_path)}_image_{i}",
                    text=description,  # description is the searchable text
                    page_number=0,
                    source_type="image",
                    image_description=description,
                    image_data=img_data,
                    metadata={"markdown_path": md_path, "image_path": img_path}
                ))
                print(f"üñºÔ∏è  Created image chunk for {os.path.basename(img_path)}")

            # Simple markdown cleaning: drop header, list, blockquote and rule lines.
            # NOTE(review): whole lines are discarded, so the text of headers and
            # list items never reaches the index — confirm this is intended.
            skipped_prefixes = ('#', '- ', '* ', '> ', '---', '***', '___')
            kept_lines = []
            for line in markdown_text.splitlines():
                line = line.strip()
                if line and not line.startswith(skipped_prefixes):
                    kept_lines.append(line)
            cleaned_text = " ".join(kept_lines)

            if cleaned_text.strip():
                print(f"Extracted text from markdown {md_path}. Text length: {len(cleaned_text)}")
                # Chunk the text
                text_chunks = extract_and_chunk_text(cleaned_text, CONFIG["chunk_size"], CONFIG["chunk_overlap"])
                print(f"Split into {len(text_chunks)} text chunks from markdown")

                if text_chunks:
                    for i, chunk_text in enumerate(text_chunks):
                        all_chunks.append(DocumentChunk(
                            id=f"{os.path.basename(md_path)}_chunk_{i}",
                            text=chunk_text,
                            page_number=0,
                            source_type="markdown",
                            metadata={"markdown_path": md_path}
                        ))
                else:
                    print(f"‚ö†Ô∏è No text chunks generated for markdown file {md_path}")

            print(f"‚úÖ Extracted {len(all_chunks)} chunks (text and image) from markdown {md_path}")
            return all_chunks

        except Exception as e:
            print(f"‚ùå Error extracting content from markdown file {md_path}: {e}")
            return []

    def ingest_chunks(self, chunks: List[DocumentChunk]):
        """Embed chunks, upsert them to Qdrant and extend the BM25 index.

        Chunks whose embedding cannot be obtained are skipped entirely: no
        point, no BM25 entry and no ``self.chunks`` entry. A failed upsert is
        reported on stdout, but the BM25 corpus and ``self.chunks`` are still
        extended with the embedded chunks.

        Args:
            chunks: Chunks produced by the ``extract_*`` methods.
        """
        print("üîÑ Starting ingestion process...")
        print(f"Attempting to ingest {len(chunks)} chunks.")

        points = []
        newly_added_corpus = []  # token lists produced by this call only

        for i, chunk in enumerate(chunks):
            print(f"Processing chunk {i+1}/{len(chunks)} (ID: {chunk.id}, Type: {chunk.source_type})")
            # Get embedding
            embedding = get_gemini_embedding_with_retry(chunk.text, "RETRIEVAL_DOCUMENT")
            if embedding is None:
                print(f"‚ö†Ô∏è  Skipping chunk {chunk.id} - failed to get embedding")
                continue
            print(f"‚úÖ Embedding generated for chunk {chunk.id}")

            # A random UUID is used as the point id; the readable chunk id
            # travels in the payload as "chunk_id".
            unique_point_id = str(uuid.uuid4())

            point = PointStruct(
                id=unique_point_id,
                vector=embedding,
                payload={
                    "chunk_id": chunk.id,
                    "text": chunk.text,
                    "page_number": chunk.page_number,
                    "source_type": chunk.source_type,
                    "image_description": chunk.image_description,
                    "has_image": chunk.image_data is not None,
                    "metadata": chunk.metadata or {}
                }
            )
            points.append(point)
            print(f"‚úÖ Point prepared for chunk {chunk.id}. Total points prepared: {len(points)}")

            # Prepare for BM25 using NLTK and stop words
            try:
                tokens = word_tokenize(chunk.text.lower())
                filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
                newly_added_corpus.append(filtered_tokens)
                print(f"‚úÖ Tokenized and filtered chunk {chunk.id} for BM25.")
            except Exception as e:
                print(f"‚ùå Error tokenizing/filtering chunk {chunk.id} for BM25: {e}")
                newly_added_corpus.append([])  # placeholder keeps corpus and chunk lists the same length

            # Keep the chunk object for later lookup; accumulates across calls.
            self.chunks.append(chunk)

        print(f"Prepared {len(points)} points for upserting to Qdrant.")
        if points:
            # Upsert to Qdrant
            try:
                response = self.qdrant_client.upsert(
                    collection_name=CONFIG["collection_name"],
                    points=points,
                    wait=True  # block until the operation completes
                )
                print(f"‚úÖ Qdrant upsert response: {response}")
                print(f"‚úÖ Upserted {len(points)} points to Qdrant.")
            except Exception as e:
                print(f"‚ùå Error during Qdrant upsert: {e}")

        else:
            print("‚ö†Ô∏è No points to upsert to Qdrant.")

        # Accumulate the new corpus
        self.bm25_corpus.extend(newly_added_corpus)

        # Rebuild only when at least one document has tokens.
        if self.bm25_corpus and any(self.bm25_corpus):
            self.bm25_index = BM25Okapi(self.bm25_corpus)
            print(f"‚úÖ BM25 index built with {len(self.bm25_corpus)} documents.")

            # Save BM25 corpus to file
            self.save_bm25_corpus()

        else:
            self.bm25_index = None  # no usable corpus: make sure no stale index remains
            self.bm25_corpus = []
            print("‚ö†Ô∏è No corpus for building BM25 index.")

        print(f"‚úÖ Successfully processed {len(points)} chunks for ingestion")

    def ingest_document(self, doc_paths: str | List[str]):
        """Run the full ingestion pipeline for one path or a list of paths.

        Files are dispatched on their extension (case-insensitive): ``.pdf``
        to ``extract_pdf_content``, ``.md`` to ``extract_markdown_content``.
        Anything else is skipped with a message. Per-document errors are
        reported on stdout and do not stop the remaining documents.

        Args:
            doc_paths: A single path string or a list of path strings.
        """
        if isinstance(doc_paths, str):
            doc_paths = [doc_paths]
            print(f"Processing single document: {doc_paths[0]}")
        elif isinstance(doc_paths, list):
            print(f"Processing {len(doc_paths)} documents from a list.")
        else:
            print(f"‚ùå Unsupported input type for ingestion: {type(doc_paths)}")
            return

        all_chunks = []
        for doc_path in doc_paths:
            try:
                if doc_path.lower().endswith('.pdf'):
                    print(f"Extracting content from PDF: {doc_path}")
                    chunks = self.extract_pdf_content(doc_path)
                elif doc_path.lower().endswith('.md'):
                    print(f"Extracting content from Markdown: {doc_path}")
                    chunks = self.extract_markdown_content(doc_path)
                else:
                    print(f"‚ùå Skipping unsupported file type: {doc_path}")
                    continue
                all_chunks.extend(chunks)
            except Exception as e:
                print(f"‚ùå Error processing document {doc_path}: {e}")

        if all_chunks:
            # Reload the persisted corpus so this run extends what is on disk.
            self.load_bm25_corpus(self.bm25_corpus_file)
            self.ingest_chunks(all_chunks)
            # ingest_chunks saves only when the corpus is non-empty; this save
            # also persists the empty-corpus case.
            self.save_bm25_corpus()
        else:
            print("‚ö†Ô∏è No chunks extracted from the provided document(s).")

    def load_bm25_corpus(self, file_path: str):
        """Load the tokenized BM25 corpus from a JSON file into ``self.bm25_corpus``.

        ``self.bm25_corpus`` is reset to ``[]`` when the file is missing,
        unreadable, or does not contain a JSON list. Nothing is raised.

        Args:
            file_path: Path of the JSON corpus file.
        """
        if os.path.exists(file_path):
            try:
                with open(file_path, "r") as f:
                    loaded_corpus = json.load(f)
                if isinstance(loaded_corpus, list):
                    self.bm25_corpus = loaded_corpus
                    # Success is reported only when the corpus was actually accepted.
                    print(f"‚úÖ BM25 corpus loaded from {file_path}")
                else:
                    print(f"‚ö†Ô∏è Loaded BM25 corpus from {file_path} is not a list. Initializing with empty corpus.")
                    self.bm25_corpus = []
            except Exception as e:
                print(f"‚ùå Error loading BM25 corpus from {file_path}: {e}")
                self.bm25_corpus = []
        else:
            print(f"‚ö†Ô∏è BM25 corpus file not found at {file_path}. BM25 index will be built during ingestion.")
            self.bm25_corpus = []

    def save_bm25_corpus(self):
        """Save the tokenized BM25 corpus to ``self.bm25_corpus_file`` as JSON.

        Failures are reported on stdout rather than raised.
        """
        try:
            with open(self.bm25_corpus_file, "w") as f:
                json.dump(self.bm25_corpus, f)
            print(f"‚úÖ BM25 corpus saved to {self.bm25_corpus_file}")
        except Exception as e:
            print(f"‚ùå Error saving BM25 corpus: {e}")

## Update file selection interface

### Subtask:
Modify the file upload interface to allow selecting multiple files at once (the closest equivalent to choosing a folder) instead of a single file at a time.


**Reasoning**:
Modify the existing file upload interface to allow multiple file selection and update the callback function to handle a list of uploaded files.



In [None]:
def create_upload_interface(on_files_saved=None):
    """Create and display a multi-file upload widget for PDF/Markdown files.

    Selected files are written to the current working directory under their
    base name. The observer's return value is not captured by anything in
    this function, so the saved paths are handed on through
    ``on_files_saved`` instead.

    Args:
        on_files_saved: Optional callable that receives the list of saved
            file paths after every upload (for example
            ``ingester.ingest_document``). When ``None`` (the default) the
            files are only saved.

    Returns:
        The ``widgets.FileUpload`` instance that was displayed.
    """
    print("üìÅ Upload PDF or Markdown documents (select multiple files):")

    upload_button = widgets.FileUpload(
        accept='.pdf,.md',  # Accept both PDF and Markdown files
        multiple=True,      # Allow multiple file selection
        description='Choose Files'
    )

    def _iter_uploads(value):
        """Yield (filename, content) for both FileUpload value layouts.

        ipywidgets 7 exposes a dict keyed by file name; ipywidgets 8 exposes
        a tuple of dicts that carry the name under the "name" key.
        """
        if isinstance(value, dict):
            for filename, file_info in value.items():
                yield filename, file_info['content']
        else:
            for file_info in value:
                yield file_info['name'], file_info['content']

    def on_upload_change(change):
        if not change['new']:
            return

        uploaded_file_paths = []
        for filename, content in _iter_uploads(change['new']):
            # Keep only the base name so a crafted file name cannot write
            # outside the working directory.
            safe_name = os.path.basename(filename)
            with open(safe_name, 'wb') as f:
                f.write(content)
            print(f"‚úÖ Uploaded: {safe_name}")
            uploaded_file_paths.append(safe_name)

        if on_files_saved is not None:
            print(f"Processing {len(uploaded_file_paths)} files...")
            on_files_saved(uploaded_file_paths)
        else:
            print(f"Saved {len(uploaded_file_paths)} files (no processing callback registered).")

    upload_button.observe(on_upload_change, names='value')
    display(upload_button)

    return upload_button

# Update the main execution block to use the modified interface
if __name__ == "__main__":
    # Run the demo
    print("üöÄ Starting Hybrid Multimodal Search Demo")
    print("=" * 50)
    qdrant_client = create_qdrant_client()
    # Initialize system
    ingester = DocumentIngester(qdrant_client)
    # search_engine = HybridSearchEngine(qdrant_client, ingester) # SearchEngine initialization moved after ingestion

    # Create upload interface
    upload_widget = create_upload_interface()

    # NOTE(review): as called here, the upload callback only saves the selected
    # files to disk; nothing in this cell passes them to
    # ingester.ingest_document, so step 2 of the instructions below has no
    # ingestion to wait for. The search engine is likewise not created here;
    # the user is told to run the next cell manually.

    print("""
üìã INSTRUCTIONS:
1. Upload PDF and/or Markdown files using the file picker above (select multiple files).
2. Wait for processing to complete (watch the output).
3. After uploading and processing, run the *next* code cell to initialize the search engine.
4. Then, run searches using the search_engine.hybrid_search() function.

Example usage:
search_engine.hybrid_search("your query here")
search_engine.display_results(results)
""")


üöÄ Starting Hybrid Multimodal Search Demo
‚úÖ Qdrant client initialized successfully!
‚úÖ Collection 'calcom_help_docs' already exists.
‚ö†Ô∏è BM25 corpus file not found at bm25_calcom_corpus.json. BM25 index will be built during ingestion.
üìÅ Upload PDF or Markdown documents (select multiple files):


FileUpload(value={}, accept='.pdf,.md', description='Choose Files', multiple=True)


üìã INSTRUCTIONS:
1. Upload PDF and/or Markdown files using the file picker above (select multiple files).
2. Wait for processing to complete (watch the output).
3. After uploading and processing, run the *next* code cell to initialize the search engine.
4. Then, run searches using the search_engine.hybrid_search() function.

Example usage:
search_engine.hybrid_search("your query here")
search_engine.display_results(results)



## Testing

### Subtask:
Test the updated ingestion process with a sample markdown file containing images and with a folder containing multiple markdown files to ensure chunks and image information are created correctly and embeddings are generated.


**Reasoning**:
Set up the test fixtures: create a test directory containing a sample markdown file that references images, the referenced sample image files, and a second markdown file without images. Then initialize the Qdrant client and `DocumentIngester`, and call the `ingest_document` method with the list of markdown file paths.



In [None]:
import os
from PIL import Image


# 6. Initialize the Qdrant client and DocumentIngester
# Only this step of the test setup is present in this cell; it rebinds the
# notebook-level `qdrant_client` and `ingester` names used by later cells.
qdrant_client = create_qdrant_client()
ingester = DocumentIngester(qdrant_client)


‚úÖ Qdrant client initialized successfully!
‚úÖ Collection 'calcom_help_docs' already exists.
‚ö†Ô∏è BM25 corpus file not found at bm25_calcom_corpus.json. BM25 index will be built during ingestion.


**Reasoning**:
The previous command failed because the markdown file was not written before attempting to move it. The markdown content should be written to the file first, and then the file should be moved to the test directory along with the images. The code needs to be fixed to write the markdown file before moving it.



## Update demo workflow

### Subtask:
Adjust the `run_demo` function to utilize the updated file selection interface.


**Reasoning**:
Modify the `run_demo` function to use the updated `create_upload_interface` and adjust the user instructions in the main execution block.



In [None]:
def create_upload_interface_and_button():
    """Create and display an upload widget plus a button that triggers ingestion.

    Selecting files enables the "Start Ingestion" button. Clicking it saves
    the selected files into the current working directory and passes their
    paths to the notebook-level ``ingester.ingest_document``. All messages
    are written to the returned output area.

    Returns:
        Tuple ``(upload_button, ingest_button, output_area)`` of the three
        displayed widgets.
    """
    print("üìÅ Upload PDF or Markdown documents (select multiple files):")

    upload_button = widgets.FileUpload(
        accept='.pdf,.md',  # Accept both PDF and Markdown files
        multiple=True,      # Allow multiple file selection
        description='Choose Files'
    )

    ingest_button = widgets.Button(
        description='Start Ingestion',
        disabled=True,  # Disabled until files are selected
        button_style='',  # 'success', 'info', 'warning', 'danger' or ''
        tooltip='Click to start ingestion of selected files'
    )

    output_area = widgets.Output()  # Area to display messages

    def _iter_uploads(value):
        """Yield (filename, content) for both FileUpload value layouts.

        ipywidgets 7 exposes a dict keyed by file name; ipywidgets 8 exposes
        a tuple of dicts that carry the name under the "name" key.
        """
        if isinstance(value, dict):
            for filename, file_info in value.items():
                yield filename, file_info['content']
        else:
            for file_info in value:
                yield file_info['name'], file_info['content']

    def on_upload_change(change):
        """Enable the ingest button only while at least one file is selected."""
        with output_area:
            has_files = bool(change['new'])
            ingest_button.disabled = not has_files
            output_area.clear_output(wait=True)
            if has_files:
                print(f"‚úÖ {len(change['new'])} file(s) selected. Click 'Start Ingestion' to proceed.")
            else:
                print("Please select file(s) to enable ingestion.")

    def on_ingest_button_click(b):
        """Save the selected files to disk and hand them to the ingester."""
        # Disable immediately so a second click cannot start a parallel run;
        # the button is re-enabled by the next file selection.
        ingest_button.disabled = True
        with output_area:
            output_area.clear_output(wait=True)  # Clear previous output
            print("üîÑ Starting ingestion process...")
            uploaded_files_info = upload_button.value
            uploaded_file_paths = []
            current_dir = os.getcwd()  # /content/ in Colab

            if not uploaded_files_info:
                print("‚ö†Ô∏è No files selected for ingestion.")
                return

            # Save files to the current directory and collect paths
            for filename, content in _iter_uploads(uploaded_files_info):
                # Keep only the base name so a crafted file name cannot
                # escape the working directory.
                file_path = os.path.join(current_dir, os.path.basename(filename))
                try:
                    with open(file_path, 'wb') as f:
                        f.write(content)
                    print(f"‚úÖ Uploading and saving: {file_path}")
                    uploaded_file_paths.append(file_path)
                except Exception as e:
                    print(f"‚ùå Error saving file {filename}: {e}")

            print(f"Processing {len(uploaded_file_paths)} files...")
            if not uploaded_file_paths:
                print("‚ö†Ô∏è No files were successfully uploaded for ingestion.")
                return

            # Resolve the notebook-level ingester on its own so that a
            # NameError raised inside ingestion is not misreported as
            # "not initialized".
            try:
                active_ingester = ingester
            except NameError:
                print("‚ùå Ingester not initialized. Please run the setup cells first.")
                return

            try:
                active_ingester.ingest_document(uploaded_file_paths)
                print("‚úÖ Ingestion complete for uploaded files.")
                # The search engine is not rebuilt here; the user is asked to
                # run the next cell manually.
                print("\nReady to re-initialize the search engine. Please run the next code cell.")
            except Exception as e:
                print(f"‚ùå Error during ingestion of uploaded files: {e}")

    upload_button.observe(on_upload_change, names='value')
    ingest_button.on_click(on_ingest_button_click)

    display(upload_button, ingest_button, output_area)

    return upload_button, ingest_button, output_area


# Update the main execution block to use the modified interface
if __name__ == "__main__":
    # Run the demo
    print("üöÄ Starting Hybrid Multimodal Search Demo")
    print("=" * 50)
    qdrant_client = create_qdrant_client()
    # Initialize system
    # `ingester` must be bound at notebook level before the widget is used:
    # the button callback looks it up by name when clicked.
    ingester = DocumentIngester(qdrant_client)


    # Create upload interface and button
    upload_widget, ingest_button, output_area = create_upload_interface_and_button()

    # Ingestion is triggered by clicking the "Start Ingestion" button, not by
    # this cell. The search engine is not created here either; the user is
    # instructed to re-initialize it manually after ingestion.

    print("""
üìã INSTRUCTIONS:
1. Use the 'Choose Files' button above to select PDF and/or Markdown files (select multiple files).
2. After selecting files, click the 'Start Ingestion' button to begin processing.
3. Wait for processing and ingestion to complete for all uploaded files (watch the output above the button).
4. After ingestion, run the code cell below this one to re-initialize the search engine with the newly ingested data.
5. Then, run searches using the search_engine.hybrid_search() function.

Example usage:
search_engine.hybrid_search("your query here")
search_engine.display_results(results)
""")

üöÄ Starting Hybrid Multimodal Search Demo
‚úÖ Qdrant client initialized successfully!
‚úÖ Collection 'calcom_help_docs' already exists.
‚ö†Ô∏è BM25 corpus file not found at bm25_calcom_corpus.json. BM25 index will be built during ingestion.
üìÅ Upload PDF or Markdown documents (select multiple files):


FileUpload(value={}, accept='.pdf,.md', description='Choose Files', multiple=True)

Button(description='Start Ingestion', disabled=True, style=ButtonStyle(), tooltip='Click to start ingestion of‚Ä¶

Output()


üìã INSTRUCTIONS:
1. Use the 'Choose Files' button above to select PDF and/or Markdown files (select multiple files).
2. After selecting files, click the 'Start Ingestion' button to begin processing.
3. Wait for processing and ingestion to complete for all uploaded files (watch the output above the button).
4. After ingestion, run the code cell below this one to re-initialize the search engine with the newly ingested data.
5. Then, run searches using the search_engine.hybrid_search() function.

Example usage:
search_engine.hybrid_search("your query here")
search_engine.display_results(results)



# Task
Implement batch ingestion for a large number of Markdown files located in a user-selected folder, ensuring the process handles potential runtime disconnections and provides progress feedback.

## Modify ingestion logic for batching

### Subtask:
Update the `on_ingest_button_click` function to divide the list of uploaded file paths into smaller batches.


**Reasoning**:
Update the `on_ingest_button_click` function to implement batch processing for uploaded files, including defining a batch size, iterating through batches, slicing file paths for each batch, calling `ingest_document` with the batch, and adding a time delay.



In [None]:
def create_upload_interface_and_button(batch_size=10, batch_delay=1.0):
    """Create the file-upload widget, the ingestion button, and their output area.

    The three widgets are displayed immediately. Selecting files enables the
    button; clicking it saves the uploads into the current working directory
    and passes them to the module-level ``ingester`` in batches.

    Args:
        batch_size: Maximum number of files handed to
            ``ingester.ingest_document`` per call. Must be >= 1.
        batch_delay: Seconds to pause after a successful batch before starting
            the next one. Values <= 0 disable the pause.

    Returns:
        Tuple ``(upload_button, ingest_button, output_area)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    print("📁 Upload PDF or Markdown documents (select multiple files):")

    upload_button = widgets.FileUpload(
        accept='.pdf,.md', # Accept both PDF and Markdown files
        multiple=True,    # Allow multiple file selection
        description='Choose Files'
    )

    ingest_button = widgets.Button(
        description='Start Ingestion',
        disabled=True, # Disable initially until files are uploaded
        button_style='', # 'success', 'info', 'warning', 'danger' or ''
        tooltip='Click to start ingestion of selected files'
    )

    output_area = widgets.Output() # Area to display messages

    def _iter_uploaded_files(upload_value):
        """Yield ``(filename, content)`` pairs from a FileUpload value.

        ipywidgets 7 exposes a dict keyed by file name, ipywidgets 8 a tuple
        of dicts carrying a 'name' entry; both layouts are accepted.
        """
        if isinstance(upload_value, dict):
            for name, info in upload_value.items():
                yield name, info['content']
        else:
            for info in upload_value:
                yield info['name'], info['content']

    def on_upload_change(change):
        """Enable the ingest button only while at least one file is selected."""
        with output_area:
            # wait=True defers the clear until the next message is printed.
            output_area.clear_output(wait=True)
            if change['new']:
                ingest_button.disabled = False
                print(f"✅ {len(change['new'])} file(s) selected. Click 'Start Ingestion' to proceed.")
            else:
                ingest_button.disabled = True
                print("Please select file(s) to enable ingestion.")

    def on_ingest_button_click(b):
        """Save the selected uploads to disk and ingest them batch by batch."""
        with output_area:
            output_area.clear_output(wait=True) # Clear previous output
            print("🔄 Starting ingestion process...")

            uploaded_files_info = upload_button.value
            if not uploaded_files_info:
                print("⚠️ No files selected for ingestion.")
                return

            # Resolve the global ingester once, up front: a missing ingester is
            # reported before any work is done, and a NameError raised *inside*
            # ingestion is no longer mistaken for a missing ingester.
            try:
                active_ingester = ingester
            except NameError:
                print("❌ Ingester not initialized. Please run the setup cells first.")
                return

            # Disable the button for the whole run so clicks made while the
            # kernel is busy cannot queue a second ingestion of the same files.
            ingest_button.disabled = True

            # Save files to the current directory and collect paths
            current_dir = os.getcwd() # /content/ in Colab
            uploaded_file_paths = []
            for filename, content in _iter_uploaded_files(uploaded_files_info):
                # Keep only the final path component so an upload name cannot
                # point outside the working directory.
                safe_name = os.path.basename(str(filename).replace('\\', '/'))
                file_path = os.path.join(current_dir, safe_name)
                try:
                    if not safe_name:
                        raise ValueError("empty file name")
                    with open(file_path, 'wb') as f:
                        f.write(content)
                    print(f"✅ Uploading and saving: {file_path}")
                    uploaded_file_paths.append(file_path)
                except Exception as e:
                    print(f"❌ Error saving file {filename}: {e}")

            print(f"Processing {len(uploaded_file_paths)} files...")

            num_files = len(uploaded_file_paths)
            if not num_files:
                print("⚠️ No files were successfully uploaded for ingestion.")
                return

            num_batches = (num_files + batch_size - 1) // batch_size  # ceiling division
            print(f"Total files: {num_files}, Batch size: {batch_size}, Number of batches: {num_batches}")

            failed_batches = []
            for batch_number, start_index in enumerate(range(0, num_files, batch_size), start=1):
                current_batch_paths = uploaded_file_paths[start_index:start_index + batch_size]

                print(f"\nProcessing batch {batch_number}/{num_batches} ({len(current_batch_paths)} files)...")

                try:
                    active_ingester.ingest_document(current_batch_paths)
                except Exception as e:
                    # A failed batch is recorded and the remaining batches still run.
                    print(f"❌ Error during ingestion of batch {batch_number}: {e}")
                    failed_batches.append(batch_number)
                    continue

                print(f"✅ Ingestion complete for batch {batch_number}.")

                # Pause between batches, but not after the last one.
                if batch_delay > 0 and batch_number < num_batches:
                    time.sleep(batch_delay)

            if failed_batches:
                print(f"\n⚠️ Ingestion finished with errors in {len(failed_batches)} of {num_batches} batch(es): {failed_batches}")
            else:
                print("\n✅ All ingestion batches processed.")

            # The search engine is not rebuilt here; the user is asked to run
            # the next cell once this callback has finished.
            print("\nReady to re-initialize the search engine. Please run the next code cell.")

    upload_button.observe(on_upload_change, names='value')
    ingest_button.on_click(on_ingest_button_click)

    display(upload_button, ingest_button, output_area)

    return upload_button, ingest_button, output_area

# Main execution block: set up the client and ingester, then show the upload UI.
if __name__ == "__main__":
    # Run the demo
    print("🚀 Starting Hybrid Multimodal Search Demo")
    print("=" * 50)
    qdrant_client = create_qdrant_client()

    # `ingester` must be a module-level name: the "Start Ingestion" callback
    # looks it up globally when the button is clicked.
    ingester = DocumentIngester(qdrant_client)

    # Create upload interface and button
    upload_widget, ingest_button, output_area = create_upload_interface_and_button()

    # Ingestion itself only starts when the user clicks "Start Ingestion";
    # re-initializing the search engine afterwards is a manual step.
    print("""
📋 INSTRUCTIONS:
1. Use the 'Choose Files' button above to select PDF and/or Markdown files (select multiple files).
2. After selecting files, click the 'Start Ingestion' button to begin processing.
3. Files are ingested in batches. Wait until every batch has been processed (progress such as "Processing batch 1/3" appears in the output area below the buttons).
4. After ingestion, run the code cell below this one to re-initialize the search engine with the newly ingested data.
5. Then, run searches using the search_engine.hybrid_search() function.

Example usage:
results = search_engine.hybrid_search("your query here")
search_engine.display_results(results)
""")

🚀 Starting Hybrid Multimodal Search Demo
✅ Qdrant client initialized successfully!
✅ Collection 'calcom_help_docs' already exists.
⚠️ BM25 corpus file not found at bm25_calcom_corpus.json. BM25 index will be built during ingestion.
📁 Upload PDF or Markdown documents (select multiple files):


FileUpload(value={}, accept='.pdf,.md', description='Choose Files', multiple=True)

Button(description='Start Ingestion', disabled=True, style=ButtonStyle(), tooltip='Click to start ingestion of…

Output()


📋 INSTRUCTIONS:
1. Use the 'Choose Files' button above to select PDF and/or Markdown files (select multiple files).
2. After selecting files, click the 'Start Ingestion' button to begin processing.
3. Wait for processing and ingestion to complete for all uploaded files (watch the output above the button).
4. After ingestion, run the code cell below this one to re-initialize the search engine with the newly ingested data.
5. Then, run searches using the search_engine.hybrid_search() function.

Example usage:
search_engine.hybrid_search("your query here")
search_engine.display_results(results)



## Process batches

### Subtask:
Iterate through the batches and call the `ingest_document` method (or a modified version of it) for each batch.


## Update progress feedback

### Subtask:
Add feedback to the output area to show the progress of batch ingestion (e.g., "Processing batch 1 of 10").


**Reasoning**:
Update the `on_ingest_button_click` function to add feedback to the output area, showing the progress of batch ingestion.



In [None]:
def create_upload_interface_and_button(batch_size=10, batch_delay=1.0):
    """Create the file-upload widget, the ingestion button, and their output area.

    The three widgets are displayed immediately. Selecting files enables the
    button; clicking it saves the uploads into the current working directory
    and passes them to the module-level ``ingester`` in batches, reporting
    per-batch progress in the output area.

    Args:
        batch_size: Maximum number of files handed to
            ``ingester.ingest_document`` per call. Must be >= 1.
        batch_delay: Seconds to pause after a successful batch before starting
            the next one. Values <= 0 disable the pause.

    Returns:
        Tuple ``(upload_button, ingest_button, output_area)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    print("📁 Upload PDF or Markdown documents (select multiple files):")

    upload_button = widgets.FileUpload(
        accept='.pdf,.md', # Accept both PDF and Markdown files
        multiple=True,    # Allow multiple file selection
        description='Choose Files'
    )

    ingest_button = widgets.Button(
        description='Start Ingestion',
        disabled=True, # Disable initially until files are uploaded
        button_style='', # 'success', 'info', 'warning', 'danger' or ''
        tooltip='Click to start ingestion of selected files'
    )

    output_area = widgets.Output() # Area to display messages

    def _iter_uploaded_files(upload_value):
        """Yield ``(filename, content)`` pairs from a FileUpload value.

        ipywidgets 7 exposes a dict keyed by file name, ipywidgets 8 a tuple
        of dicts carrying a 'name' entry; both layouts are accepted.
        """
        if isinstance(upload_value, dict):
            for name, info in upload_value.items():
                yield name, info['content']
        else:
            for info in upload_value:
                yield info['name'], info['content']

    def on_upload_change(change):
        """Enable the ingest button only while at least one file is selected."""
        with output_area:
            # wait=True defers the clear until the next message is printed.
            output_area.clear_output(wait=True)
            if change['new']:
                ingest_button.disabled = False
                print(f"✅ {len(change['new'])} file(s) selected. Click 'Start Ingestion' to proceed.")
            else:
                ingest_button.disabled = True
                print("Please select file(s) to enable ingestion.")

    def on_ingest_button_click(b):
        """Save the selected uploads to disk and ingest them batch by batch."""
        # NOTE(review): this callback mixes output captured by the context
        # manager (print) with output_area.append_stdout(); confirm in the
        # target frontend that neither overwrites the other's messages.
        with output_area:
            output_area.clear_output(wait=True) # Clear previous output
            print("🔄 Starting ingestion process...")

            uploaded_files_info = upload_button.value
            if not uploaded_files_info:
                print("⚠️ No files selected for ingestion.")
                return

            # Resolve the global ingester once, up front: a missing ingester is
            # reported before any work is done, and a NameError raised *inside*
            # ingestion is no longer mistaken for a missing ingester.
            try:
                active_ingester = ingester
            except NameError:
                output_area.append_stdout("❌ Ingester not initialized. Please run the setup cells first.\n")
                return

            # Disable the button for the whole run so clicks made while the
            # kernel is busy cannot queue a second ingestion of the same files.
            ingest_button.disabled = True

            # Save files to the current directory and collect paths
            current_dir = os.getcwd() # /content/ in Colab
            uploaded_file_paths = []
            for filename, content in _iter_uploaded_files(uploaded_files_info):
                # Keep only the final path component so an upload name cannot
                # point outside the working directory.
                safe_name = os.path.basename(str(filename).replace('\\', '/'))
                file_path = os.path.join(current_dir, safe_name)
                try:
                    if not safe_name:
                        raise ValueError("empty file name")
                    with open(file_path, 'wb') as f:
                        f.write(content)
                    output_area.append_stdout(f"✅ Uploading and saving: {file_path}\n")
                    uploaded_file_paths.append(file_path)
                except Exception as e:
                    output_area.append_stdout(f"❌ Error saving file {filename}: {e}\n")

            output_area.append_stdout(f"\nProcessing {len(uploaded_file_paths)} files...\n")

            num_files = len(uploaded_file_paths)
            if not num_files:
                output_area.append_stdout("⚠️ No files were successfully uploaded for ingestion.\n")
                return

            num_batches = (num_files + batch_size - 1) // batch_size  # ceiling division
            output_area.append_stdout(f"Total files: {num_files}, Batch size: {batch_size}, Number of batches: {num_batches}\n")

            failed_batches = []
            for batch_number, start_index in enumerate(range(0, num_files, batch_size), start=1):
                current_batch_paths = uploaded_file_paths[start_index:start_index + batch_size]

                # Progress feedback for each batch
                output_area.append_stdout(f"\nProcessing batch {batch_number}/{num_batches} ({len(current_batch_paths)} files)...\n")

                try:
                    active_ingester.ingest_document(current_batch_paths)
                except Exception as e:
                    # A failed batch is recorded and the remaining batches still run.
                    output_area.append_stdout(f"❌ Error during ingestion of batch {batch_number}: {e}\n")
                    failed_batches.append(batch_number)
                    continue

                output_area.append_stdout(f"✅ Ingestion complete for batch {batch_number}.\n")

                # Pause between batches, but not after the last one.
                if batch_delay > 0 and batch_number < num_batches:
                    time.sleep(batch_delay)

            if failed_batches:
                output_area.append_stdout(f"\n⚠️ Ingestion finished with errors in {len(failed_batches)} of {num_batches} batch(es): {failed_batches}\n")
            else:
                output_area.append_stdout("\n✅ All ingestion batches processed.\n")

            # The search engine is not rebuilt here; the user is asked to run
            # the next cell once this callback has finished.
            output_area.append_stdout("\nReady to re-initialize the search engine. Please run the next code cell.\n")

    upload_button.observe(on_upload_change, names='value')
    ingest_button.on_click(on_ingest_button_click)

    display(upload_button, ingest_button, output_area)

    return upload_button, ingest_button, output_area

# Main execution block: set up the client and ingester, then show the upload UI.
if __name__ == "__main__":
    # Run the demo
    print("🚀 Starting Hybrid Multimodal Search Demo")
    print("=" * 50)
    qdrant_client = create_qdrant_client()

    # `ingester` must be a module-level name: the "Start Ingestion" callback
    # looks it up globally when the button is clicked.
    ingester = DocumentIngester(qdrant_client)

    # Create upload interface and button
    upload_widget, ingest_button, output_area = create_upload_interface_and_button()

    # Ingestion itself only starts when the user clicks "Start Ingestion";
    # re-initializing the search engine afterwards is a manual step.
    print("""
📋 INSTRUCTIONS:
1. Use the 'Choose Files' button above to select PDF and/or Markdown files (select multiple files).
2. After selecting files, click the 'Start Ingestion' button to begin processing.
3. Files are ingested in batches. Wait until every batch has been processed (progress such as "Processing batch 1/3" appears in the output area below the buttons).
4. After ingestion, run the code cell below this one to re-initialize the search engine with the newly ingested data.
5. Then, run searches using the search_engine.hybrid_search() function.

Example usage:
results = search_engine.hybrid_search("your query here")
search_engine.display_results(results)
""")

🚀 Starting Hybrid Multimodal Search Demo
✅ Qdrant client initialized successfully!
✅ Collection 'calcom_help_docs' already exists.
✅ BM25 corpus loaded from bm25_calcom_corpus.json
✅ BM25 index rebuilt from loaded corpus.
📁 Upload PDF or Markdown documents (select multiple files):


FileUpload(value={}, accept='.pdf,.md', description='Choose Files', multiple=True)

Button(description='Start Ingestion', disabled=True, style=ButtonStyle(), tooltip='Click to start ingestion of…

Output()


📋 INSTRUCTIONS:
1. Use the 'Choose Files' button above to select PDF and/or Markdown files (select multiple files).
2. After selecting files, click the 'Start Ingestion' button to begin processing.
3. Wait for processing and ingestion to complete for all uploaded files (watch the output above the button).
4. After ingestion, run the code cell below this one to re-initialize the search engine with the newly ingested data.
5. Then, run searches using the search_engine.hybrid_search() function.

Example usage:
search_engine.hybrid_search("your query here")
search_engine.display_results(results)



## Testing with batches

### Subtask:
Test the batch ingestion process with a smaller number of files that can be divided into multiple batches to ensure the logic works correctly.


**Reasoning**:
Create a directory named `test_batch_ingestion`, create several simple markdown files within it with some basic text content, and create sample image files to be referenced in the markdown.



**Reasoning**:
Initialize the Qdrant client and DocumentIngester, then call the `ingest_document` method with the list of created markdown file paths to test the batch ingestion process.



In [None]:
# 6. Initialize the Qdrant client and DocumentIngester
# create_qdrant_client and DocumentIngester must already be defined by earlier cells.
qdrant_client = create_qdrant_client()
ingester = DocumentIngester(qdrant_client)

# 7. Call the ingest_document method with the list of markdown file paths.
# If the list was not left behind by an earlier cell (e.g. after a kernel
# restart), rebuild it from the test directory instead of failing with a
# NameError.
try:
    markdown_file_paths
except NameError:
    test_batch_dir = "test_batch_ingestion"
    if not os.path.isdir(test_batch_dir):
        raise FileNotFoundError(
            f"'{test_batch_dir}' not found. Create the test markdown files before running this cell."
        )
    markdown_file_paths = sorted(
        os.path.join(test_batch_dir, entry)
        for entry in os.listdir(test_batch_dir)
        if entry.lower().endswith(".md")
    )
    if not markdown_file_paths:
        raise FileNotFoundError(f"No .md files found in '{test_batch_dir}'.")

ingester.ingest_document(markdown_file_paths)

## Update demo workflow instructions

### Subtask:
Adjust the instructions for the user to reflect the batch ingestion process.


**Reasoning**:
Modify the instructions in the main execution block to clearly explain the batch ingestion process using the updated interface, guiding the user to select multiple files, click the ingest button, and watch the output for progress.

