In [3]:
# ! pip install langchain-openai
# ! pip install -U langchain
! pip install PyPDF2

Collecting PyPDF2
  Using cached pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Using cached pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [4]:
from typing import Dict, List, Union
from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import PyPDFLoader
# from langchain_community.utilities import tesseract
from langchain_core.tools import BaseTool
from langchain.agents import AgentExecutor
from langchain.prompts import ChatPromptTemplate
from langchain.schema import SystemMessage
from langgraph.graph import StateGraph, END
import PyPDF2
from pdfminer.high_level import extract_text as pdfminer_extract_text
import os



In [5]:
# ------------------------------------------------------------------------------
# 1. Document Loading
# ------------------------------------------------------------------------------
def load_pdf(file_path: str) -> List[Document]:
    loader = PyPDFLoader(file_path)
    return loader.load()



In [6]:
# ------------------------------------------------------------------------------
# 2. Classifier Tool
# ------------------------------------------------------------------------------
class DocumentClassifierTool(BaseTool):
    name = "document_classifier"
    description = "Classifies if a PDF document is likely digital or non-digital (scanned)."

    def _run(self, file_path: str) -> str:
        try:
            with open(file_path, 'rb') as f:
                pdf_reader = PyPDF2.PdfReader(f)
                for page_num in range(min(5, len(pdf_reader.pages))):  # Check first few pages
                    page = pdf_reader.pages[page_num]
                    if page.extract_text().strip():
                        return "digital"
            return "non_digital"
        except Exception as e:
            print(f"Error during classification: {e}")
            return "unknown"

    async def _arun(self, file_path: str) -> str:
        raise NotImplementedError("This tool does not support asynchronous execution.")

classifier_tool = DocumentClassifierTool()

def classify_document(documents: List[Document]) -> str:
    # For simplicity, let's assume we classify based on the first document's metadata (if available)
    # A more robust approach would involve analyzing the content.
    file_path = documents[0].metadata.get("source")
    if file_path:
        return classifier_tool.run(file_path)
    return "unknown"



PydanticUserError: Field 'name' defined on a base class was overridden by a non-annotated attribute. All field definitions, including overrides, require a type annotation.

For further information visit https://errors.pydantic.dev/2.10/u/model-field-overridden

In [None]:
# ------------------------------------------------------------------------------
# 3. OCR Tool (Tesseract)
# ------------------------------------------------------------------------------
class TesseractOCRTool(BaseTool):
    name = "tesseract_ocr"
    description = "Extracts text from a non-digital PDF document using Tesseract OCR."

    def _run(self, file_path: str) -> str:
        try:
            return tesseract.image_to_string(file_path) # Requires converting PDF to image first
        except Exception as e:
            return f"Error during OCR: {e}"

    async def _arun(self, file_path: str) -> str:
        raise NotImplementedError("This tool does not support asynchronous execution.")

ocr_tool = TesseractOCRTool()

def extract_text_ocr(documents: List[Document]) -> str:
    file_path = documents[0].metadata.get("source")
    if file_path:
        # Need to convert PDF pages to images for Tesseract
        try:
            from pdf2image import convert_from_path
            images = convert_from_path(file_path)
            full_text = ""
            for img in images:
                full_text += ocr_tool.run(img) + "\n\n"
            return full_text.strip()
        except ImportError:
            return "Error: pdf2image library not installed. Please install it (pip install pdf2image)."
        except Exception as e:
            return f"Error during PDF to image conversion or OCR: {e}"
    return ""



In [7]:
# ------------------------------------------------------------------------------
# 4. PDF to Text Tool (Python Library - pdfminer.six)
# ------------------------------------------------------------------------------
def extract_text_digital(documents: List[Document]) -> str:
    file_path = documents[0].metadata.get("source")
    if file_path:
        try:
            return pdfminer_extract_text(file_path)
        except Exception as e:
            return f"Error during digital PDF text extraction: {e}"
    return ""



In [8]:
# ------------------------------------------------------------------------------
# 5. Extraction Tool (OpenAI)
# ------------------------------------------------------------------------------
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

extraction_prompt = ChatPromptTemplate.from_messages(
    [
        SystemMessage(
            "You are an expert at extracting general details from documents."
        ),
        ("user", "Extract key information from the following text: {text}"),
    ]
)

extraction_chain = extraction_prompt | llm

def extract_details(text: str) -> str:
    return extraction_chain.invoke({"text": text}).content



In [9]:
# ------------------------------------------------------------------------------
# 6. Final Response
# ------------------------------------------------------------------------------
def format_response(extracted_details: str) -> Dict[str, str]:
    return {"extracted_information": extracted_details}

# ------------------------------------------------------------------------------
# Langgraph State Definition
# ------------------------------------------------------------------------------
class GraphState(TypedDict):
    documents: List[Document]
    classification: str
    extracted_text: str
    extracted_details: str

# ------------------------------------------------------------------------------
# Langgraph Workflow Definition
# ------------------------------------------------------------------------------
builder = StateGraph(GraphState)

# Load the document
builder.add_node("load_document", RunnableLambda(load_pdf))

# Classify the document
builder.add_node("classify", RunnableLambda(classify_document))

# Extract text for non-digital documents (OCR)
builder.add_node("extract_text_ocr", RunnableLambda(extract_text_ocr))

# Extract text for digital documents
builder.add_node("extract_text_digital", RunnableLambda(extract_text_digital))

# Extract general details using OpenAI
builder.add_node("extract_details", RunnableLambda(extract_details))

# Format the final response
builder.add_node("format_response", RunnableLambda(format_response))

# Define edges
builder.set_entry_point("load_document")

builder.add_edge("load_document", "classify")

# Conditional routing based on classification
def should_use_ocr(state):
    return state["classification"] == "non_digital"

def should_use_digital_extraction(state):
    return state["classification"] == "digital"

builder.add_conditional_edges(
    "classify",
    {
        "non_digital": "extract_text_ocr",
        "digital": "extract_text_digital",
        "unknown": "extract_text_digital", # Default to digital if unknown
    },
    should_use_ocr,
)

builder.add_edge("extract_text_ocr", "extract_details")
builder.add_edge("extract_text_digital", "extract_details")
builder.add_edge("extract_details", "format_response")
builder.add_edge("format_response", END)

# Compile the graph
graph = builder.compile()



NameError: name 'TypedDict' is not defined

In [None]:
# ------------------------------------------------------------------------------
# Example Usage
# ------------------------------------------------------------------------------
if __name__ == "__main__":
    # Create a dummy digital PDF file for testing
    with open("digital_document.pdf", "w") as f:
        f.write("This is a sample digital PDF document.\nIt contains some text.")

    # Create a dummy non-digital PDF file (you'd typically have an actual scanned PDF)
    # For this example, we'll just point to an image file that Tesseract can process.
    # You'll need to have Tesseract installed and configured.
    # You might need to manually create a simple image file (e.g., a screenshot) named "scanned_document.png"
    # and then "convert" it to a single-page PDF named "non_digital_document.pdf".
    # Example using ImageMagick (command line): `convert scanned_document.png non_digital_document.pdf`
    if not os.path.exists("non_digital_document.pdf"):
        print("Please create a 'non_digital_document.pdf' (e.g., from a scanned image) for full testing.")

    # Test with a digital document
    print("--- Processing Digital Document ---")
    result_digital = graph.invoke({"documents": [{"metadata": {"source": "digital_document.pdf"}}]})
    print(result_digital)

    # Test with a non-digital document (if the file exists)
    if os.path.exists("non_digital_document.pdf"):
        print("\n--- Processing Non-Digital Document ---")
        result_non_digital = graph.invoke({"documents": [{"metadata": {"source": "non_digital_document.pdf"}}]})
        print(result_non_digital)
    else:
        print("\nSkipping non-digital document test as 'non_digital_document.pdf' was not found.")

    # Clean up dummy files
    os.remove("digital_document.pdf")
    if os.path.exists("non_digital_document.pdf"):
        # Be cautious about deleting actual scanned documents
        pass # os.remove("non_digital_document.pdf")