<a href="https://colab.research.google.com/github/sssangeetha/OutamationAI_OCR_RAG_Automation/blob/main/RAG_using_Llamaindex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import os

import google.generativeai as genai

# Configure the Gemini client.
# Prefer the GOOGLE_API_KEY environment variable so a real key never has to be
# pasted into (and committed with) the notebook; the "your-key" placeholder is
# kept only as a fallback so the cell behaves as before when the variable is
# not set.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", "your-key"))

In [8]:
def classify_query_llm(query: str) -> str:
    """
    Classify a user query into one of the known document types
    using Google's Gemini model.

    Args:
        query: Free-text user question to route.

    Returns:
        One of "pay_stub", "loan_form", "resume", "contract", "w2" or
        "unknown". "unknown" is returned for an empty/blank query, for a
        reply that carries no text, and for any reply that does not reduce
        to exactly one of the known labels.
    """

    # List of known document types
    doc_types = ["pay_stub", "loan_form", "resume", "contract", "w2", "unknown"]

    # Nothing to classify: skip the (billable) API round trip entirely.
    if not query or not query.strip():
        return "unknown"

    # Prompt to guide the LLM
    prompt = f"""
    You are a document classification assistant.
    Given the user query below, classify which type of document it refers to.

    Possible document types: {doc_types}

    Query: "{query}"

    Respond with only one label from the list above — do not explain.
    """

    # Run the LLM. Uses the same model as classify_doc_type_llm so queries and
    # pages are labelled by the same classifier.
    # NOTE(review): the previous "gemini-1.5-flash" id belongs to an older
    # model generation — confirm availability before switching back.
    model = genai.GenerativeModel("gemini-2.5-flash")
    response = model.generate_content(prompt)

    # `response.text` raises ValueError when the reply contains no text part
    # (for example when the candidate was blocked); treat that like any other
    # unusable answer instead of crashing the caller.
    try:
        raw_label = response.text
    except ValueError:
        return "unknown"

    # Extract and clean result. The prompt shows the labels as a quoted Python
    # list, so the model may echo the label wrapped in quotes/backticks or
    # followed by a period; strip those before comparing.
    predicted_doc_type = raw_label.strip().strip("\"'`.").strip().lower()

    # Validate output (fallback to 'unknown' if invalid)
    if predicted_doc_type not in doc_types:
        predicted_doc_type = "unknown"

    return predicted_doc_type


In [11]:


# -----------------------------------------
# STEP 2: Sample PDF metadata store
# -----------------------------------------
# In-memory stand-in for a real metadata store: one record per PDF.
# Every field is a string:
#   file_id  - identifier of the record
#   filename - name of the source PDF
#   year     - year associated with the document
#   text     - extracted page text that the classifier reads
pdf_metadata_store = [
    dict(
        file_id="1",
        filename="PayStatement_Nov_2024.pdf",
        year="2024",
        text="PAY DATE: Nov 1, 2024\nNET PAY $1,201.21\nYEAR TO DATE...",
    ),
    dict(
        file_id="2",
        filename="Loan_Agreement_2023.pdf",
        year="2023",
        text="This loan agreement is made between the borrower and lender...",
    ),
    dict(
        file_id="3",
        filename="Resume_Sangeetha.pdf",
        year="2025",
        text="Sri Sai Sangeetha Jannapureddy\nSoftware Engineer with experience in React.js and Spring Boot...",
    ),
]

# -----------------------------------------
# STEP 3: Define classify_doc_type_llm()
# -----------------------------------------
def classify_doc_type_llm(page_text: str) -> str:
    """
    Classify a page into one of the known document types using Gemini.

    Args:
        page_text: Extracted text of the page. Only the first 700 characters
            are sent to the model.

    Returns:
        One of "pay_stub", "loan_form", "resume", "contract", "w2" or
        "unknown". "unknown" is returned for empty/blank text, for a reply
        that carries no text, and for any reply that does not reduce to
        exactly one of the known labels.
    """
    doc_types = ["pay_stub", "loan_form", "resume", "contract", "w2", "unknown"]

    # Nothing to classify: skip the (billable) API round trip entirely.
    if not page_text or not page_text.strip():
        return "unknown"

    # Limit to the first few characters to save tokens. The slice is done here
    # rather than inline so that this explanation stays a real comment instead
    # of leaking into the prompt text sent to the model.
    excerpt = page_text[:700]

    prompt = f"""
    You are a document classifier. Read the page content below and
    determine which type of document it belongs to.

    Possible document types: {doc_types}

    Page text:
    {excerpt}

    Respond with only one label from the list above — do not explain.
    """

    model = genai.GenerativeModel("gemini-2.5-flash")
    response = model.generate_content(prompt)

    # `response.text` raises ValueError when the reply contains no text part
    # (for example when the candidate was blocked); fall back to "unknown"
    # so one bad page does not abort a whole classification run.
    try:
        raw_label = response.text
    except ValueError:
        return "unknown"

    # The prompt shows the labels as a quoted Python list, so the model may
    # echo the label wrapped in quotes/backticks or followed by a period;
    # strip those before comparing.
    predicted_type = raw_label.strip().strip("\"'`.").strip().lower()

    if predicted_type not in doc_types:
        predicted_type = "unknown"

    return predicted_type


# -----------------------------------------
# STEP 4: Classify each page and update metadata
# -----------------------------------------
# Tag every stored record with the document type predicted from its text.
for record in pdf_metadata_store:
    record["doc_type"] = classify_doc_type_llm(record["text"])

print("✅ Document types assigned successfully!")
for record in pdf_metadata_store:
    print(f"{record['filename']} → {record['doc_type']}")


# -----------------------------------------
# STEP 5: Keep only records whose doc_type matches the query's type
# -----------------------------------------
predicted_doc_type = "pay_stub"   # this normally comes from classify_query_llm()

matched_documents = []
for record in pdf_metadata_store:
    if record["doc_type"] == predicted_doc_type:
        matched_documents.append(record)


# -----------------------------------------
# STEP 6: Optional keyword fallback (only when the type filter found nothing)
# -----------------------------------------
if not matched_documents:
    keywords = ["salary", "net pay", "income", "earnings", "compensation"]
    for record in pdf_metadata_store:
        # Case-insensitive substring match against the record's text.
        lowered_text = record["text"].lower()
        if any(keyword in lowered_text for keyword in keywords):
            matched_documents.append(record)


# -----------------------------------------
# STEP 7: Assemble and show the routing result
# -----------------------------------------
final_output = dict(
    query="What is my monthly salary?",
    predicted_doc_type=predicted_doc_type,
    matched_documents=matched_documents,
)

print("\n🔍 Final Routing Result:\n", final_output)


✅ Document types assigned successfully!
PayStatement_Nov_2024.pdf → pay_stub
Loan_Agreement_2023.pdf → loan_form
Resume_Sangeetha.pdf → resume

🔍 Final Routing Result:
 {'query': 'What is my monthly salary?', 'predicted_doc_type': 'pay_stub', 'matched_documents': [{'file_id': '1', 'filename': 'PayStatement_Nov_2024.pdf', 'year': '2024', 'text': 'PAY DATE: Nov 1, 2024\nNET PAY $1,201.21\nYEAR TO DATE...', 'doc_type': 'pay_stub'}]}
