In [2]:


import os
from dotenv import load_dotenv, find_dotenv
# Read settings from a .env file before anything else runs; the LLM cell later
# looks up OPENROUTER_API_KEY through os.getenv. The result is bound to `_`
# so the cell does not echo the call's return value.
_ = load_dotenv(find_dotenv())

In [3]:
# Import all necessary LangChain components
from langchain_openai import ChatOpenAI
from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain_community.document_loaders import CSVLoader, PyPDFLoader, Docx2txtLoader, DirectoryLoader
from langchain.chains import RetrievalQA
from langchain.chains.router import MultiRetrievalQAChain
from langchain.chains.router.llm_router import LLMRouterChain, RouterOutputParser
from langchain.chains.router.multi_retrieval_prompt import MULTI_RETRIEVAL_ROUTER_TEMPLATE
from langchain.prompts import PromptTemplate
from langchain.schema import Document
import glob
from typing import List, Dict

In [4]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model shared by every vector store in this notebook.
# Vectors are normalised so that similarity scores behave like cosine similarity.
_embedding_encode_kwargs = {"normalize_embeddings": True}
embeddings = HuggingFaceEmbeddings(
    encode_kwargs=_embedding_encode_kwargs,
    model_name="sentence-transformers/all-mpnet-base-v2",
)




In [5]:
# Smoke-test the embedding model on one short sentence
test_text = "Hello World, how are you?"
test_embedding = embeddings.embed_query(test_text)

# Report the vector length and a short prefix rather than the whole vector
embedding_dimension = len(test_embedding)
leading_values = test_embedding[:5]
print(f"Embedding dimension: {embedding_dimension}")
print(f"Sample embedding values: {leading_values}")



Embedding dimension: 768
Sample embedding values: [0.04652441293001175, 0.0034151612780988216, -0.014530838467180729, -0.033341288566589355, 0.03532649949193001]


In [6]:
# Initialize the LLM (used for routing and query answering)
# Fail fast when the key is missing: os.getenv would otherwise return None and
# the misconfiguration would only surface later, far from its cause.
openrouter_api_key = os.getenv("OPENROUTER_API_KEY")
if not openrouter_api_key:
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set. Add it to your .env file or environment "
        "before running this cell."
    )

llm = ChatOpenAI(
    temperature=0.0,  # keep answers and routing decisions as repeatable as possible
    base_url="https://openrouter.ai/api/v1",
    api_key=openrouter_api_key,
    model="openai/gpt-oss-20b:free",
)
print("LLM initialized successfully!")

LLM initialized successfully!


In [10]:
# Document Loader Function - Handles PDF, DOCX, and CSV files
def load_documents_by_type(directory: str = ".") -> Dict[str, List[Document]]:
    """
    Load every PDF, DOCX and CSV file found directly inside ``directory``.

    Args:
        directory: Folder to scan (not recursive). Defaults to the current
            working directory.

    Returns:
        A dictionary with the keys 'pdf', 'docx' and 'csv'. Each value is the
        list of documents loaded for that type (empty when no file matched);
        every document has its type stored under ``metadata['doc_type']``.
    """
    # One entry per supported type: (dictionary key == file extension, loader class).
    # Each loader is constructed with the file path as its first argument.
    loaders_by_type = (
        ('pdf', PyPDFLoader),
        ('docx', Docx2txtLoader),
        ('csv', CSVLoader),
    )

    documents_by_type = {doc_type: [] for doc_type, _ in loaders_by_type}

    for doc_type, loader_cls in loaders_by_type:
        # os.path.join keeps the search inside `directory` even when it is ""
        # or already ends with a separator.
        pattern = os.path.join(directory, f"*.{doc_type}")
        # glob yields matches in arbitrary order; sort so reruns load the
        # files (and therefore the documents) in the same order.
        for file_path in sorted(glob.glob(pattern)):
            print(f"Loading {doc_type.upper()}: {file_path}")
            docs = loader_cls(file_path).load()
            # Add document type to metadata
            for doc in docs:
                doc.metadata['doc_type'] = doc_type
            documents_by_type[doc_type].extend(docs)

    return documents_by_type

# Load everything found in the current working directory
all_documents = load_documents_by_type(".")

# Print summary: total number of loaded documents across all types
total_document_count = sum(map(len, all_documents.values()))
print("\n=== Document Loading Summary ===")
print(f"Total documents: {total_document_count}")


Loading PDF: .\iphone17.pdf
Loading DOCX: .\f1info.docx
Loading CSV: .\sales.csv

=== Document Loading Summary ===
Total documents: 307


In [11]:
# Create separate vector stores for each document type
vector_stores = {}
retrievers = {}

# How many documents each retriever returns per query, keyed by document type.
# NOTE(review): CSV uses a larger k, presumably because each CSV record is a
# small document of its own - confirm against the loader's output.
TOP_K_BY_TYPE = {'pdf': 5, 'docx': 5, 'csv': 10}

for doc_type, top_k in TOP_K_BY_TYPE.items():
    docs = all_documents.get(doc_type)
    if not docs:
        # No documents of this type were loaded: register no store/retriever
        continue
    print(f"Creating {doc_type.upper()} vector store...")
    vector_stores[doc_type] = DocArrayInMemorySearch.from_documents(docs, embeddings)
    retrievers[doc_type] = vector_stores[doc_type].as_retriever(
        search_kwargs={"k": top_k}
    )

print("\n=== Vector Stores Created ===")
print(f"Active retrievers: {list(retrievers.keys())}")


Creating PDF vector store...




Creating DOCX vector store...
Creating CSV vector store...

=== Vector Stores Created ===
Active retrievers: ['pdf', 'docx', 'csv']


In [19]:
# Wrap QA chains to accept {"input": ...} and forward as {"query": ...}
# NOTE(review): this cell reads `qa_chains`, which is assigned in the cell that
# comes after it in the notebook; on a fresh kernel that cell has to run first.
from langchain_core.runnables import RunnableLambda


def _as_query_payload(x):
    """Convert router-style input into the {"query": ...} dict passed to a QA chain."""
    # Accept either {"input": "..."} or a bare question. Only dicts are
    # unpacked, because calling .get on a plain string raises AttributeError.
    question = x.get("input", x) if isinstance(x, dict) else x
    return {"query": question}


wrapped_destinations = {}
for name, chain in qa_chains.items():
    # The chain is bound through a default argument so every lambda keeps its
    # own chain instead of sharing the loop variable's final value.
    wrapped_destinations[name] = RunnableLambda(
        lambda x, _chain=chain: _chain.invoke(_as_query_payload(x))
    )


In [20]:
# Create QA chains for each document type
qa_chains = {}

# Options common to every chain; only the retriever changes per document type
_qa_chain_options = dict(
    llm=llm,
    chain_type="stuff",
    verbose=True,
    return_source_documents=True,
)

for doc_type, retriever in retrievers.items():
    print(f"Creating QA chain for {doc_type.upper()} documents...")
    qa_chains[doc_type] = RetrievalQA.from_chain_type(
        retriever=retriever, **_qa_chain_options
    )

print(f"\n=== QA Chains Created ===")
print(f"Available chains: {list(qa_chains.keys())}")

Creating QA chain for PDF documents...
Creating QA chain for DOCX documents...
Creating QA chain for CSV documents...

=== QA Chains Created ===
Available chains: ['pdf', 'docx', 'csv']


In [21]:
# Build Router Chain using LangChain
# The router determines which document type to query based on the user's question

# Router-facing description of each document type, in routing-candidate order
_ROUTER_DESCRIPTIONS = {
    "pdf": "The pdf document is all about iphone, iphone 17 series launch.Good for answering questions about PDF documents, reports, articles, papers, documentation, manuals, and textual information stored in PDF format",
    "docx": "The word document is all about F1 Singapore Grand Prix 2025.Good for answering questions about Word documents, letters, memos, proposals, written content, formatted documents, and business documents stored in DOCX format",
    "csv": "The csv document is all about sales data of a company.Good for answering questions about tabular data, spreadsheets, sales records, orders, numerical data, statistics, transactions, customer data, and structured data stored in CSV format",
}

# Define retriever information for the router; types without a retriever are skipped
retriever_infos = [
    {
        "name": type_name,
        "description": type_description,
        "retriever": retrievers[type_name],
    }
    for type_name, type_description in _ROUTER_DESCRIPTIONS.items()
    if type_name in retrievers
]

print(f"Router configured with {len(retriever_infos)} document types")
print(f"Available document types: {[info['name'] for info in retriever_infos]}")

Router configured with 3 document types
Available document types: ['pdf', 'docx', 'csv']


In [27]:
# Create a custom routing function using LangChain
from langchain.chains.router.base import MultiRouteChain
from langchain.chains.router.llm_router import RouterChain, RouterOutputParser

def create_router_chain():
    """
    Build an LLM router that decides which document type should answer a query.

    Reads the module-level ``qa_chains``, ``retriever_infos`` and ``llm``.

    Returns:
        A ``(router_chain, destination_chains)`` tuple: the ``LLMRouterChain``
        and a dict mapping each document type name to its QA chain.
    """
    # Shallow copy, so editing the returned mapping leaves qa_chains untouched
    destination_chains = dict(qa_chains)

    # One "name: description" line per candidate, shown to the routing LLM
    destinations_str = "\n".join(
        f"{info['name']}: {info['description']}" for info in retriever_infos
    )
    # The finished text is handed to PromptTemplate, where single braces mark
    # variables; double any braces in the descriptions so they stay literal.
    destinations_str = destinations_str.replace("{", "{{").replace("}", "}}")

    router_template = f"""Given a raw text input to a language model, select the model prompt best suited for the input.
You will be given the names of the available document types and a description of what each type is best suited for.
You may also revise the original input if you think that revising it will ultimately lead to a better response.

<< FORMATTING >>
Return a markdown code snippet with a JSON object formatted to look like:
```json
{{{{
    "destination": string \\ name of the document type to use
    "next_inputs": string \\ the original input
}}}}
```

REMEMBER: "destination" MUST be one of the candidate document types specified below.

<< CANDIDATE DOCUMENT TYPES >>
{destinations_str}

<< INPUT >>
{{input}}

<< OUTPUT >>
"""

    router_prompt = PromptTemplate(
        template=router_template,
        input_variables=["input"],
        # The destinations are the QA chains, which are invoked with a "query"
        # key elsewhere in this notebook; the parser's default would wrap the
        # routed text under "input" instead.
        output_parser=RouterOutputParser(next_inputs_inner_key="query"),
    )

    router_chain = LLMRouterChain.from_llm(
        llm=llm,
        prompt=router_prompt,
        verbose=True
    )

    return router_chain, destination_chains

# Create the router
# NOTE(review): no other cell visible in this notebook reads router_chain or
# destination_chains; queries go through router_qa_chain, which is built with
# MultiRetrievalQAChain.from_retrievers from retriever_infos.
router_chain, destination_chains = create_router_chain()
print("Router chain created successfully!")

Router chain created successfully!


In [29]:
# Create a single router QA chain that handles routing and retrieval
from langchain.chains.router import MultiRetrievalQAChain

# One chain that both routes the question and retrieves the answer.
# The same LLM serves as router and as the fallback (default) chain's model.
router_qa_chain = MultiRetrievalQAChain.from_retrievers(
    llm=llm,
    retriever_infos=retriever_infos,
    default_chain_llm=llm,
    verbose=True,
)

available_document_types = [info['name'] for info in retriever_infos]
print("Router QA chain ready!")
print(f"Available document types: {available_document_types}")

Router QA chain ready!
Available document types: ['pdf', 'docx', 'csv']


  validated_self = self.__pydantic_validator__.validate_python(data, self_instance=self)


In [31]:
# ============================================================
# MAIN QUERY FUNCTION - PUT YOUR QUESTION HERE
# ============================================================

def query_documents(user_query: str):
    """
    Ask a question and let the router choose which document set answers it.

    Args:
        user_query: Your question (string)

    The question is routed to one of:
        - PDF documents (for iPhone 17 info)
        - DOCX documents (for F1 Singapore GP info)
        - CSV files (for sales data analysis)

    Returns:
        The answer text, or None when the chain raised an exception (the
        error and its traceback are printed in that case).
    """
    rule = '=' * 60
    print(f"\n{rule}")
    print(f"USER QUERY: {user_query}")
    print(f"{rule}\n")

    try:
        # Router automatically picks the right document type
        response = router_qa_chain.invoke({"input": user_query})

        # Normalise the response to plain text
        if isinstance(response, dict):
            # Prefer "result", then "output"; otherwise show the whole dict
            answer = response.get("result") or response.get("output") or str(response)
        else:
            # Strings pass through untouched; any other object is stringified
            answer = response if isinstance(response, str) else str(response)

        print(f"\n{rule}")
        print("ANSWER:")
        print(rule)
        print(answer)
        print()

        return answer
    except Exception as e:
        print(f"Error: {str(e)}")
        import traceback
        traceback.print_exc()
        return None




In [34]:

# YOUR CUSTOM QUERY - Edit and run this cell with your question


# Put your question here (between the quotes):
my_question = "Can you list me five corporate segment orders in the sales data?"

# Run the query. query_documents already prints the answer, so the return
# value is stored instead of being left as the cell's last expression, which
# would make the notebook echo the same text a second time as a raw repr.
answer = query_documents(my_question)






USER QUERY: Can you list me five corporate segment orders in the sales data?



[1m> Entering new MultiRetrievalQAChain chain...[0m
csv: {'query': 'Can you list me five corporate segment orders in the sales data?'}
[1m> Finished chain.[0m

ANSWER:
Here are five orders that belong to the **Corporate** segment from the data you provided:

| Order ID | Customer Name | Sales | Product Name |
|----------|---------------|-------|--------------|
| **US-2017-124303** | Fred Hopkins | $16.06 | Wirebound Message Books, 5‑1/2 × 4 Forms |
| **CA-2017-132976** | Andrew Gjertsen | $11.65 | Post‑it “Important Message” Note Pad, Neon Colors |
| **US-2017-145366** | Christine Abelman | $57.58 | Recycled Interoffice Envelopes with String and Button Closure |
| **US-2014-100853** | Jennifer Braxton | $52.45 | Kensington 7‑Outlet MasterPiece HomeOffice Power Control Center |
| **US-2014-156216** | Erin Ashbrook | $18.65 | GBC Instant Index System for Binding Systems |

All of these orders are marked 

'Here are five orders that belong to the **Corporate** segment from the data you provided:\n\n| Order ID | Customer Name | Sales | Product Name |\n|----------|---------------|-------|--------------|\n| **US-2017-124303** | Fred Hopkins | $16.06 | Wirebound Message Books, 5‑1/2\u202f×\u202f4 Forms |\n| **CA-2017-132976** | Andrew Gjertsen | $11.65 | Post‑it “Important Message” Note Pad, Neon Colors |\n| **US-2017-145366** | Christine Abelman | $57.58 | Recycled Interoffice Envelopes with String and Button Closure |\n| **US-2014-100853** | Jennifer Braxton | $52.45 | Kensington 7‑Outlet MasterPiece HomeOffice Power Control Center |\n| **US-2014-156216** | Erin Ashbrook | $18.65 | GBC Instant Index System for Binding Systems |\n\nAll of these orders are marked as **Corporate** in the segment field.'

In [None]:
# Direct Query to Specific Document Type (Advanced Usage)
# If you know which document type you want to query, you can bypass the router

def query_specific_type(question: str, doc_type: str, max_sources: int = 3):
    """
    Query a specific document type directly, bypassing the router.

    Args:
        question: The question to ask.
        doc_type: 'pdf', 'docx', or 'csv' (must be a key of ``qa_chains``).
        max_sources: Maximum number of source documents to preview.

    Returns:
        The chain's result dict (answer under 'result'; retrieved documents
        under 'source_documents' when present), or None when the document
        type is unavailable or the chain raised an exception.
    """
    if doc_type not in qa_chains:
        print(f"Error: Document type '{doc_type}' not available")
        print(f"Available types: {list(qa_chains.keys())}")
        return None

    print(f"\n{'='*60}")
    print(f"Querying {doc_type.upper()} documents directly...")
    print(f"QUESTION: {question}")
    print(f"{'='*60}\n")

    try:
        # Use .invoke() like the other cells of this notebook; calling the
        # chain object directly is LangChain's deprecated entry point.
        result = qa_chains[doc_type].invoke({"query": question})

        print(f"\n{'='*60}")
        print("ANSWER:")
        print(f"{'='*60}")
        print(result['result'])

        # Show source documents (the header reports the total, the loop
        # previews at most `max_sources` of them)
        sources = result.get('source_documents') or []
        if sources:
            print(f"\n{'='*60}")
            print(f"SOURCES ({len(sources)} documents):")
            print(f"{'='*60}")
            for i, doc in enumerate(sources[:max_sources], 1):
                print(f"\nSource {i}:")
                print(f"File: {doc.metadata.get('source', 'Unknown')}")
                print(f"Preview: {doc.page_content[:200]}...")

        return result
    except Exception as e:
        # Same reporting as query_documents: message plus full traceback
        print(f"Error: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

# Example: Query CSV directly
# if 'csv' in qa_chains:
#     query_specific_type("What is the average sales amount?", "csv")
