In [None]:
# Multi-Document Search Engine with LangChain Router
# Supports: PDF, Word (DOCX), and CSV files

import os
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

In [None]:
# Import all necessary LangChain components
from langchain_openai import ChatOpenAI
from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain_community.document_loaders import CSVLoader, PyPDFLoader, Docx2txtLoader, DirectoryLoader
from langchain.chains import RetrievalQA
from langchain.chains.router import MultiRetrievalQAChain
from langchain.chains.router.llm_router import LLMRouterChain, RouterOutputParser
from langchain.chains.router.multi_retrieval_prompt import MULTI_RETRIEVAL_ROUTER_TEMPLATE
from langchain.prompts import PromptTemplate
from langchain.schema import Document
import glob
from typing import List, Dict

In [4]:
from langchain_huggingface import HuggingFaceEmbeddings

# Hugging Face model id of the sentence encoder used for every embedding in this notebook.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"

# normalize_embeddings=True requests unit-length vectors (for cosine similarity).
embeddings = HuggingFaceEmbeddings(
    encode_kwargs=dict(normalize_embeddings=True),
    model_name=EMBEDDING_MODEL_NAME,
)

In [None]:
# Smoke test: embed one short sentence and report the vector size plus its first values.
test_text = "Hello World, how are you?"
test_embedding = embeddings.embed_query(test_text)
print(
    f"Embedding dimension: {len(test_embedding)}",
    f"Sample embedding values: {test_embedding[:5]}",
    sep="\n",
)



768
[0.04652441293001175, 0.0034151612780988216, -0.014530838467180729, -0.033341288566589355, 0.03532649949193001]


In [None]:
# Chat model shared by the router and by every per-type QA chain.
# Requests are sent to the OpenRouter endpoint; the API key is read from the
# OPENROUTER_API_KEY environment variable rather than being hard-coded.
llm = ChatOpenAI(
    model="openai/gpt-oss-20b:free",
    base_url="https://openrouter.ai/api/v1",
    api_key=os.getenv("OPENROUTER_API_KEY"),
    temperature=0.0,
)
print("LLM initialized successfully!")

Tokens: ['hello', 'world', ',', 'how', 'are', 'you', '?']
Token IDs: [101, 7592, 2088, 1010, 2129, 2024, 2017, 1029, 102]


In [None]:
# Document Loader Function - Handles PDF, DOCX, and CSV files
def load_documents_by_type(directory: str = ".") -> "Dict[str, List[Document]]":
    """
    Load every PDF, DOCX, and CSV file found directly inside ``directory``.

    Each loaded document gets a ``doc_type`` metadata entry ('pdf', 'docx' or
    'csv') so later stages can tell where it came from.

    Args:
        directory: Folder to scan (non-recursive). Defaults to the current
            working directory.

    Returns:
        A dictionary with the keys 'pdf', 'docx' and 'csv' (always present, in
        that order), each mapping to the list of documents loaded for that type.
        Files are processed in sorted path order so the result is reproducible.
    """
    # One entry per supported extension: how to build the loader for a file path.
    loader_factories = {
        'pdf': lambda path: PyPDFLoader(path),
        'docx': lambda path: Docx2txtLoader(path),
        'csv': lambda path: CSVLoader(file_path=path),
    }
    documents_by_type = {doc_type: [] for doc_type in loader_factories}

    # Escape the directory so glob metacharacters in its name ('[', ']', '*', '?')
    # are matched literally instead of silently matching nothing.
    safe_directory = glob.escape(directory)

    for doc_type, make_loader in loader_factories.items():
        # glob order is arbitrary; sort for a deterministic load order.
        for file_path in sorted(glob.glob(f"{safe_directory}/*.{doc_type}")):
            print(f"Loading {doc_type.upper()}: {file_path}")
            docs = make_loader(file_path).load()
            # Add document type to metadata
            for doc in docs:
                doc.metadata['doc_type'] = doc_type
            documents_by_type[doc_type].extend(docs)

    return documents_by_type

# Scan the current working directory and group what was found by file type.
all_documents = load_documents_by_type(".")

# Report how many documents each loader produced, then the grand total.
print("\n=== Document Loading Summary ===")
for summary_type in ('pdf', 'docx', 'csv'):
    print(f"{summary_type.upper()} documents loaded: {len(all_documents[summary_type])}")
total_loaded = sum(map(len, all_documents.values()))
print(f"Total documents: {total_loaded}")




In [None]:
# Build one in-memory vector store (and a retriever on top of it) per document
# type that actually has documents; types with nothing loaded are skipped.
vector_stores = {}
retrievers = {}

# Number of hits each retriever returns; CSV documents get a larger k than PDF/DOCX.
top_k_by_type = {'pdf': 5, 'docx': 5, 'csv': 10}

for store_type, top_k in top_k_by_type.items():
    type_documents = all_documents[store_type]
    if not type_documents:
        continue
    print(f"Creating {store_type.upper()} vector store...")
    type_store = DocArrayInMemorySearch.from_documents(type_documents, embeddings)
    vector_stores[store_type] = type_store
    retrievers[store_type] = type_store.as_retriever(search_kwargs={"k": top_k})

print("\n=== Vector Stores Created ===")
print(f"Active retrievers: {list(retrievers.keys())}")


[Document(metadata={'source': 'sales.csv', 'row': 0}, page_content='Row ID: 1\nOrder ID: CA-2016-152156\nOrder Date: 08-11-2016\nShip Date: 11-11-2016\nShip Mode: Second Class\nCustomer ID: CG-12520\nCustomer Name: Claire Gute\nSegment: Consumer\nCountry: United States\nCity: Henderson\nState: Kentucky\nPostal Code: 42420\nRegion: South\nProduct ID: FUR-BO-10001798\nCategory: Furniture\nSub-Category: Bookcases\nProduct Name: Bush Somerset Collection Bookcase\nSales: 261.96\nQuantity: 2\nDiscount: 0\nProfit: 41.9136'), Document(metadata={'source': 'sales.csv', 'row': 1}, page_content='Row ID: 2\nOrder ID: CA-2016-152156\nOrder Date: 08-11-2016\nShip Date: 11-11-2016\nShip Mode: Second Class\nCustomer ID: CG-12520\nCustomer Name: Claire Gute\nSegment: Consumer\nCountry: United States\nCity: Henderson\nState: Kentucky\nPostal Code: 42420\nRegion: South\nProduct ID: FUR-CH-10000454\nCategory: Furniture\nSub-Category: Chairs\nProduct Name: Hon Deluxe Fabric Upholstered Stacking Chairs, Roun

In [None]:
# One RetrievalQA chain per active retriever, keyed by document type.
# Each chain uses the "stuff" chain type and also returns the documents it retrieved.
qa_chains = {}

for doc_type, retriever in retrievers.items():
    print(f"Creating QA chain for {doc_type.upper()} documents...")
    type_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        return_source_documents=True,
        verbose=True,
    )
    qa_chains[doc_type] = type_chain

print("\n=== QA Chains Created ===")
print(f"Available chains: {list(qa_chains.keys())}")

In [None]:
# Routing metadata: the router prompt lists each document type together with a
# description of the questions it suits, and the LLM picks one of them.

# Description shown to the router for every supported document type.
retriever_descriptions = {
    "pdf": "Good for answering questions about PDF documents, reports, articles, papers, documentation, manuals, and textual information stored in PDF format",
    "docx": "Good for answering questions about Word documents, letters, memos, proposals, written content, formatted documents, and business documents stored in DOCX format",
    "csv": "Good for answering questions about tabular data, spreadsheets, sales records, orders, numerical data, statistics, transactions, customer data, and structured data stored in CSV format",
}

# Only document types that actually have a retriever are offered to the router.
retriever_infos = [
    {
        "name": type_name,
        "description": type_description,
        "retriever": retrievers[type_name],
    }
    for type_name, type_description in retriever_descriptions.items()
    if type_name in retrievers
]

print(f"Router configured with {len(retriever_infos)} document types")
print(f"Available document types: {[info['name'] for info in retriever_infos]}")

In [None]:
# Create a custom routing function using LangChain
from langchain.chains.router.base import MultiRouteChain
from langchain.chains.router.llm_router import RouterChain

def create_router_chain():
    """
    Create the LLM router that decides which document type should answer a question.

    Reads the module-level ``qa_chains``, ``retriever_infos`` and ``llm``.

    Returns:
        A ``(router_chain, destination_chains)`` tuple: the router chain built
        from the routing prompt, and a dictionary mapping each document type
        name to its QA chain.
    """
    # Destination chains: a shallow copy of the per-type QA chains.
    destination_chains = dict(qa_chains)

    # One "name: description" line per candidate document type.
    destinations_str = "\n".join(
        f"{info['name']}: {info['description']}" for info in retriever_infos
    )
    # The text below is later parsed as a prompt template, so literal braces in a
    # description must be doubled or they would be read as template variables.
    escaped_destinations = destinations_str.replace("{", "{{").replace("}", "}}")

    router_template = f"""Given a raw text input to a language model, select the model prompt best suited for the input.
You will be given the names of the available document types and a description of what each type is best suited for.
You may also revise the original input if you think that revising it will ultimately lead to a better response.

<< FORMATTING >>
Return a markdown code snippet with a JSON object formatted to look like:
```json
{{{{
    "destination": string \\ name of the document type to use
    "next_inputs": string \\ the original input
}}}}
```

REMEMBER: "destination" MUST be one of the candidate document types specified below.

<< CANDIDATE DOCUMENT TYPES >>
{escaped_destinations}

<< INPUT >>
{{input}}

<< OUTPUT >>
"""

    router_prompt = PromptTemplate(
        template=router_template,
        input_variables=["input"],
        # The router chain needs a parser on its prompt to turn the LLM's JSON
        # reply into a destination plus next inputs. The inner key is "query"
        # because that is the input key the destination QA chains are called with.
        output_parser=RouterOutputParser(next_inputs_inner_key="query"),
    )

    router_chain = LLMRouterChain.from_llm(
        llm=llm,
        prompt=router_prompt,
        verbose=True
    )

    return router_chain, destination_chains

# Create the router once; destination_chains maps each document type name to
# the QA chain that the router can hand a question to.
router_chain, destination_chains = create_router_chain()
print("Router chain created successfully!")

In [None]:
# Create Multi-Route Chain - the main query interface
# Without any loaded documents there is no chain to route to (and nothing to
# use as the default), so stop here with an actionable message.
if not destination_chains:
    raise ValueError(
        "No PDF, DOCX, or CSV documents were loaded, so there is nothing to search. "
        "Add documents and re-run the notebook from the loading step."
    )

multi_route_chain = MultiRouteChain(
    router_chain=router_chain,
    destination_chains=destination_chains,
    # Fallback used when the router does not name a destination: the first available chain.
    default_chain=next(iter(destination_chains.values())),
    verbose=True
)

print("Multi-document search engine ready!")
print(f"\nYou can now query across {len(destination_chains)} document types")
print("The router will automatically determine which document type to search based on your question.")

In [None]:
# Query Function - use this to ask questions
def query_documents(question: str):
    """
    Ask a question and let the router pick the document type that answers it.

    Args:
        question: The natural-language question to ask.

    Returns:
        The answer text (the chain output's "result" entry), or ``None`` if the
        query failed; in that case the error message is printed instead.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"QUESTION: {question}")
    print(f"{banner}\n")

    try:
        # The router will determine which document type to use.
        # invoke() is used instead of run(): run() only works for chains that
        # declare exactly one output key, which the multi-route chain does not.
        response = multi_route_chain.invoke({"input": question})
        result = response["result"]

        print(f"\n{banner}")
        print("ANSWER:")
        print(banner)
        print(result)

        return result
    except Exception as e:
        # Best-effort: report the failure without interrupting the notebook.
        print(f"Error: {str(e)}")
        return None

# Test with example queries: a smoke test of the routed query path
print("Testing the multi-document search engine...\n")

# Example 1: CSV query (if CSV exists)
# response1 keeps whatever query_documents returns (None when it reported an error).
if 'csv' in destination_chains:
    query1 = "Analyze any three orders and give a short summary"
    response1 = query_documents(query1)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
**Order 1 – Row ID 277 (Order ID CA‑2017‑132976)**  
- **Customer**: Andrew Gjertsen (Corporate, Philadelphia, PA)  
- **Ship Mode**: Standard Class (4‑day transit)  
- **Product**: Post‑it “Important Message” Note Pad (Neon Colors, 50 Sheets/Pad) – Office Supplies → Paper  
- **Quantity**: 2 units  
- **Sales**: $11.65 (after a 20 % discount)  
- **Profit**: $4.08 (≈ 35 % margin)  
- **Observations**: The order is a small, low‑cost office‑supplies purchase with a modest profit. The 20 % discount is typical for corporate orders in this category.

---

**Order 2 – Row ID 200 (Order ID US‑2017‑124303)**  
- **Customer**: Fred Hopkins (Corporate, Philadelphia, PA)  
- **Ship Mode**: Standard Class (4‑day transit)  
- **Product**: Wirebound Message Books (5‑1/2 × 4 Forms) – Office Supplies → Paper  
- **Quantity**: 3 units  
- **Sales**: $16.06 (after a 20 % discount)  
- **Profit**: $5.82 (≈ 36 % margin)  
- **Observ

In [None]:
# Direct Query to Specific Document Type (Advanced Usage)
# If you know which document type you want to query, you can bypass the router

def query_specific_type(question: str, doc_type: str):
    """
    Query one document type directly, bypassing the router.

    Args:
        question: The natural-language question to ask.
        doc_type: 'pdf', 'docx', or 'csv'; must be a key of ``qa_chains``.

    Returns:
        The chain's output dictionary (the answer is under "result"; retrieved
        documents, when present, are under "source_documents"), or ``None`` if
        the document type is unavailable or the query failed.
    """
    if doc_type not in qa_chains:
        print(f"Error: Document type '{doc_type}' not available")
        print(f"Available types: {list(qa_chains.keys())}")
        return None

    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Querying {doc_type.upper()} documents directly...")
    print(f"QUESTION: {question}")
    print(f"{banner}\n")

    try:
        # invoke() replaces the deprecated direct call on the chain object.
        result = qa_chains[doc_type].invoke({"query": question})

        print(f"\n{banner}")
        print("ANSWER:")
        print(banner)
        print(result['result'])

        # Show source documents (at most the first three, each with a short preview)
        source_documents = result.get('source_documents')
        if source_documents:
            print(f"\n{banner}")
            print(f"SOURCES ({len(source_documents)} documents):")
            print(banner)
            for i, doc in enumerate(source_documents[:3], 1):
                print(f"\nSource {i}:")
                print(f"File: {doc.metadata.get('source', 'Unknown')}")
                print(f"Preview: {doc.page_content[:200]}...")

        return result
    except Exception as e:
        # Best-effort: report the failure without interrupting the notebook.
        print(f"Error: {str(e)}")
        return None

# Example: Query CSV directly
# if 'csv' in qa_chains:
#     query_specific_type("What is the average sales amount?", "csv")


In [None]:
# Utility: Check what documents are loaded
def show_document_info():
    """
    Print a summary of the loaded documents.

    For every document type in ``all_documents`` that has documents, prints the
    total and a per-source-file breakdown (documents without a 'source' metadata
    entry are counted under 'Unknown'), followed by the overall total and the
    names of the active retrievers. Returns nothing.
    """
    from collections import Counter

    banner = "=" * 60
    print("\n" + banner)
    print("LOADED DOCUMENTS SUMMARY")
    print(banner)

    for doc_type, docs in all_documents.items():
        if not docs:
            continue
        print(f"\n{doc_type.upper()} Documents: {len(docs)}")
        # Count per source in a single pass. Using the same 'Unknown' default for
        # grouping and counting keeps source-less documents from showing up as 0,
        # and Counter lists sources in first-seen order.
        per_source = Counter(doc.metadata.get('source', 'Unknown') for doc in docs)
        for source, count in per_source.items():
            print(f"  - {source}: {count} chunks/rows")

    print("\n" + banner)
    print(f"Total document chunks: {sum(len(docs) for docs in all_documents.values())}")
    print(f"Active retrievers: {list(retrievers.keys())}")
    print(banner + "\n")

# Show what's loaded: per-type and per-source counts plus the active retrievers
show_document_info()
