## Phase 1: Indexing (One-time Setup)

This phase loads your PDF documents, merges their pages into one document per file, and builds the searchable vector database.
Run these cells once to index your documents, then use Phase 2 for queries.

### Step 1: Load PDF documents

In [1]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_core.documents import Document
from collections import defaultdict

# Configuration
DATA_DIR = "../Data/"  # directory scanned for PDF files
OLLAMA_BASE_URL = "http://localhost:11434"  # Ollama server used for embeddings
OLLAMA_EMBEDDING_MODEL = "mxbai-embed-large:335m"

# Load all PDFs from data directory (the result is treated as one entry per page)
loader = PyPDFDirectoryLoader(DATA_DIR)
raw_documents = loader.load()

# Fail fast when nothing was loaded: otherwise the later cells would merge and
# "index" zero documents and still report success, hiding a wrong DATA_DIR.
if not raw_documents:
    raise FileNotFoundError(
        f"No PDF pages were loaded from {DATA_DIR!r}; "
        "check that the directory exists and contains PDF files."
    )

print(f"Loaded {len(raw_documents)} pages from PDFs")

Loaded 400 pages from PDFs


### Step 2: Merge pages into parent documents

In [2]:
def _merge_pages(pages):
    """Combine the page-level documents of one PDF into a single parent Document.

    The page texts are joined with a blank line between them. The metadata is a
    copy of the first page's metadata with 'total_pages' set to the page count.
    """
    merged_metadata = pages[0].metadata.copy()
    merged_metadata['total_pages'] = len(pages)
    return Document(
        page_content="\n\n".join(page.page_content for page in pages),
        metadata=merged_metadata,
    )


# Bucket the loaded pages by the file they came from
docs_by_source = defaultdict(list)
for page_doc in raw_documents:
    docs_by_source[page_doc.metadata.get('source', 'unknown')].append(page_doc)

# Build one parent document per PDF file, in first-seen order of the sources
documents = [_merge_pages(file_pages) for file_pages in docs_by_source.values()]

print(f"Created {len(documents)} parent documents (1 per PDF file)")

Created 200 parent documents (1 per PDF file)


### Step 3: Set up embeddings and vector store

In [3]:
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings

# On-disk locations of the persisted artifacts
CHROMA_DIR = "../backend/chroma_db"          # Chroma vector store
PARENT_STORE_DIR = "../backend/parent_docs"  # parent document store (used in Step 4)

# Embedding client pointed at the Ollama server configured in Step 1
embeddings = OllamaEmbeddings(base_url=OLLAMA_BASE_URL, model=OLLAMA_EMBEDDING_MODEL)

# Vector store for the "records" collection, persisted under CHROMA_DIR
vectorstore = Chroma(
    persist_directory=CHROMA_DIR,
    collection_name="records",
    embedding_function=embeddings,
)

### Step 4: Configure retriever

In [4]:
from langchain.storage import LocalFileStore, create_kv_docstore
from langchain.retrievers import ParentDocumentRetriever
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter that cuts each parent document into the child chunks that get embedded
child_splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=400)

# File-backed key-value store holding the parent documents
parent_store = LocalFileStore(PARENT_STORE_DIR)
docstore = create_kv_docstore(parent_store)

# No parent splitter is configured: per the ParentDocumentRetriever docs, the
# documents passed in are then stored as-is and returned as the parents.
retriever = ParentDocumentRetriever(
    child_splitter=child_splitter,
    parent_splitter=None,
    docstore=docstore,
    vectorstore=vectorstore,
)

### Step 5: Index documents

In [5]:
# Index only the parent documents whose source file is not yet in the vector store.
#
# With ids=None, add_documents generates fresh IDs on every call, so re-running
# this cell against an existing index would re-embed every PDF and store
# duplicate child chunks (and duplicate parents). Child chunks inherit the
# parent's metadata from the text splitter, so the 'source' values already
# present in the vector store identify which files have been indexed.
print("Starting indexing...")

stored_metadatas = vectorstore.get(include=["metadatas"])["metadatas"] or []
indexed_sources = {(meta or {}).get('source') for meta in stored_metadatas}

new_documents = [
    parent for parent in documents
    if parent.metadata.get('source') not in indexed_sources
]

# Skip the call entirely when there is nothing new to add
if new_documents:
    retriever.add_documents(new_documents, ids=None)

skipped_count = len(documents) - len(new_documents)
if skipped_count:
    print(f"Skipped {skipped_count} parent documents that were already indexed")
print(f"Indexed {len(new_documents)} parent documents")

Starting indexing...
Indexed 200 parent documents


### Step 6: Verify indexing

In [6]:
# Count the child chunks currently held by the vector store
stored_ids = vectorstore.get()['ids']
num_children = len(stored_ids)

print(f"Created {num_children} child chunks from {len(documents)} parent documents")
print("Indexing complete!")

Created 1598 child chunks from 200 parent documents
Indexing complete!


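## Phase 2: Retrieval (Query Existing Index)

This phase reconnects to the persisted index created in Phase 1. Its cells define their own imports and configuration, so they can be run in a fresh kernel without re-running Phase 1.

### Step 1: Connect to the existing vector store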

In [7]:
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings

# Configuration (repeated here so this phase does not rely on Phase 1 variables)
CHROMA_DIR = "../backend/chroma_db"
PARENT_STORE_DIR = "../backend/parent_docs"
OLLAMA_BASE_URL = "http://localhost:11434"
OLLAMA_EMBEDDING_MODEL = "mxbai-embed-large:335m"

print("Loading existing index for retrieval...")

# Embedding client; uses the same model and server settings as Phase 1
embeddings = OllamaEmbeddings(base_url=OLLAMA_BASE_URL, model=OLLAMA_EMBEDDING_MODEL)

# Reconnect to the "records" collection persisted under CHROMA_DIR
vectorstore = Chroma(
    persist_directory=CHROMA_DIR,
    collection_name="records",
    embedding_function=embeddings,
)

Loading existing index for retrieval...


### Step 2: Configure retriever

In [8]:
from langchain.storage import LocalFileStore, create_kv_docstore
from langchain.retrievers import ParentDocumentRetriever
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Child splitter with the same settings as the one used during indexing
child_splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=400)

# File-backed key-value store holding the parent documents
parent_store = LocalFileStore(PARENT_STORE_DIR)
docstore = create_kv_docstore(parent_store)

# Rebuild the retriever on top of the existing vector store and parent store
retriever = ParentDocumentRetriever(
    child_splitter=child_splitter,
    parent_splitter=None,
    docstore=docstore,
    vectorstore=vectorstore,
)

print("Index loaded successfully!")

Index loaded successfully!


### Step 3: Query and retrieve document

In [9]:
query = "Who is arjun chowdhury?"
print(f"Query: '{query}'")

try:
    # Keep only the first document returned by the retriever
    results = retriever.invoke(query)[:1]

    if not results:
        print("No relevant documents found for the query.")
    else:
        print(f"Retrieved {len(results)} document")

        # Summarise the retrieved parent document without dumping its full text
        doc = results[0]
        print(f"Source File: {doc.metadata.get('source', 'Unknown')}")
        print(f"Total Pages: {doc.metadata.get('total_pages', 'Unknown')}")
        print(f"Total Content Size: {len(doc.page_content)} characters")
        print("Document content retrieved successfully")

except Exception as err:
    # Best-effort reporting: show the error and a hint instead of a traceback
    print(f"Error during retrieval: {err}")
    print("Check that Ollama is running and accessible.")

Query: 'Who is arjun chowdhury?'
Retrieved 1 document
Source File: ..\Data\Data\IR_Case_2.pdf
Total Pages: 2
Total Content Size: 2417 characters
Document content retrieved successfully


## Usage Instructions

1. **First time setup**: Run Phase 1 cells (Steps 1-6) to index your documents
2. **For queries**: Run Phase 2 cells (Steps 1-3) and modify the query variable
3. **Re-indexing**: Only run Phase 1 again if you add new documents

This setup ensures:
- Documents are processed only once
- Fast query responses
- Persistent storage across sessions
- Parent-child document retrieval for better context