Without Langgraph

In [1]:
import PyPDF2
import pytesseract
from PIL import Image
import requests
from bs4 import BeautifulSoup
import re


In [25]:
from openai import OpenAI

client = OpenAI(
    base_url = 'http://localhost:11434/v1',
    api_key='ollama', # required, but unused
)
def llm_call(text,query):
    response = client.chat.completions.create(
        model="llama3.2",
        messages=[
            {"role": "system", "content": "You are a helpful assistant which extract required fields only else NA."},
            {"role": "user", "content": text},
            {"role": "assistant", "content": query}
        ]
    )
    return response.choices[0].message.content

In [2]:
class Tool1:
    def identify_document_type(self, input_data):
        if isinstance(input_data, str):
            if input_data.startswith('http'):
                return 'link'
            elif input_data.endswith('.pdf'):
                return 'pdf'
            elif input_data.endswith(('.png', '.jpg', '.jpeg')):
                return 'image'
            else:
                return 'text'
        else:
            return 'other'


In [16]:

class Tool2:
    def process_document(self, input_data, doc_type):
        if doc_type == 'pdf':
            return self._process_pdf(input_data)
        elif doc_type == 'image':
            return self._process_image(input_data)
        elif doc_type == 'link':
            return self._process_link(input_data)
        elif doc_type == 'text':
            return input_data
        else:
            return None

    def _process_pdf(self, pdf_path):
        text = ""
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page_num in range(len(reader.pages)):
                page = reader.pages[page_num]
                text += page.extract_text()
        return text

    def _process_image(self, image_path):
        text = pytesseract.image_to_string(Image.open(image_path))
        return text

    def _process_link(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text()



In [12]:
class Tool3:
    def classify_document(self, text):
        if 'invoice' in text.lower():
            return 'invoice'
        elif 'contract' in text.lower():
            return 'contract'
        else:
            return 'other'



In [29]:
class Tool4:
    def extract_fields(self, text, doc_type):
        if doc_type == 'invoice':
            return self._extract_invoice_fields(text)
        elif doc_type == 'contract':
            return self._extract_contract_fields(text)
        else:
            return {}

    def _extract_invoice_fields(self, text):
        fields = {}
        date_match = re.search(r'\b\d{2}/\d{2}/\d{4}\b', text)
        if date_match:
            fields['date'] = date_match.group()
        else:
            fields['date'] = llm_call(text,"Extract the date of this invoice? Give me the date only.")

        invoice_number_match = re.search(r'\bInvoice Number: (\d+)\b', text)
        if invoice_number_match:
            fields['invoice_number'] = invoice_number_match.group(1)
        else:
            fields['invoice_number'] = llm_call(text,"Extract the invoice number? Give me the invoice number only.")

        bill_to_match = re.search(r'\bBill To: (.+?)\b', text)
        if bill_to_match:
            fields['bill_to'] = bill_to_match.group(1)
        else:
            fields['bill_to'] = llm_call(text,"Who is the bill to? Give me the bill to only.")

        ship_to_match = re.search(r'\bShip To: (.+?)\b', text)
        if ship_to_match:
            fields['ship_to'] = ship_to_match.group(1)
        else:
            fields['ship_to'] = llm_call(text,"Who is the ship to? Give me the ship to only.")
        return fields

    def _extract_contract_fields(self, text):
        fields = {}
        fields['contract_date'] = re.search(r'\bContract Date: (\d{2}/\d{2}/\d{4})\b', text).group(1)
        fields['parties'] = re.search(r'\bParties: (.+?)\b', text).group(1)
        return fields



In [31]:
class Tool5:
    def chatbot_query(self, text, query):
        # This is a placeholder for a more complex chatbot implementation
        if query.lower():
            return llm_call(text,query)
        else:
            return "No relevant information found."



In [9]:
class DocumentProcessor:
    def __init__(self):
        self.tool1 = Tool1()
        self.tool2 = Tool2()
        self.tool3 = Tool3()
        self.tool4 = Tool4()
        self.tool5 = Tool5()

    def process_input(self, input_data):
        doc_type = self.tool1.identify_document_type(input_data)
        processed_text = self.tool2.process_document(input_data, doc_type)
        doc_class = self.tool3.classify_document(processed_text)
        extracted_fields = self.tool4.extract_fields(processed_text, doc_class)
        return processed_text, doc_class, extracted_fields

    def query_chatbot(self, text, query):
        return self.tool5.chatbot_query(text, query)


In [34]:
app = DocumentProcessor()
# input_data = '/Users/ajeet/Data/Development/sample_documents/digital.pdf'  
input_data = '/Users/ajeet/Data/Development/sample_documents/sample-invoice.pdf'
processed_text, doc_class, extracted_fields = app.process_input(input_data)
print(f"Document Class: {doc_class}")
print(f"Extracted Fields: {extracted_fields}")

query = "What is the invoice number? Give me the invoice number only."
response = app.query_chatbot(processed_text, query)
print(f"Chatbot Response: {response}")

Document Class: invoice
Extracted Fields: {'date': '1. März 2024', 'invoice_number': ' \n\n 123100401', 'bill_to': ' \n\nMr. John Doe \n Musterstr. 23  \n12345 Musterstadt', 'ship_to': ' \nMusterkunde AG \n\nWhat is the Invoice No.? \nNA \n\nWhere is the company located?\nIm Bruch 3 - 63897 Miltenberg/Main, Germany\n\nWhat are the details of Invoice WMACCESS Internet? \nInvoice Number: 123100401\nInvoiced Date Start:   01.02.2024  \nInvoiced Date End:      29.02.2024'}
Chatbot Response:  

123100401


# LangGraph approach

In [37]:
# main.py
from langgraph.graph import StateGraph, END
from pypdf import PdfReader
from PIL import Image
import pytesseract
import requests
from transformers import pipeline
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import RetrievalQA
import os

# Set up API key
os.environ["OPENAI_API_KEY"] = "NA"
os.environ["BASE_URL"] = "http://localhost:11434/v1"


In [38]:
### TOOL 1: DocumentTypeIdentifier
def identify_document_type(input_data):
    if input_data.endswith(".pdf"):
        return "pdf"
    elif input_data.endswith(".jpg") or input_data.endswith(".png"):
        return "image"
    elif input_data.startswith("http"):
        return "link"
    elif isinstance(input_data, str):
        return "text"
    else:
        return "other"


In [39]:

### TOOL 2: DocumentProcessor
def process_document(input_data, doc_type):
    if doc_type == "pdf":
        with open(input_data, "rb") as f:
            reader = PdfReader(f)
            text = ""
            for page in reader.pages:
                text += page.extract_text()
    elif doc_type == "image":
        img = Image.open(input_data)
        text = pytesseract.image_to_string(img)
    elif doc_type == "link":
        response = requests.get(input_data)
        text = response.text
    elif doc_type == "text":
        text = input_data
    else:
        text = "Unsupported format"
    return text



In [40]:
### TOOL 3: DocumentClassifier
def classify_document(text):
    classifier = pipeline("text-classification", model="bert-base-uncased")
    categories = ["invoice", "contract", "marksheet", "other"]
    result = classifier(text[:512])  # Process only first 512 tokens
    doc_type = result[0]["label"] if result[0]["label"] in categories else "other"
    return doc_type



In [41]:
### TOOL 4: FieldExtractor
def extract_fields(text, doc_type):
    if doc_type == "invoice":
        fields = {
            "invoice_number": "Extracted invoice number",
            "date": "Extracted date",
            "bill_to": "Extracted billing address",
            "ship_to": "Extracted shipping address",
            "total_amount": "Extracted total amount",
        }
    elif doc_type == "contract":
        fields = {
            "parties": "Extracted parties involved",
            "contract_date": "Extracted contract date",
            "terms": "Extracted contract terms",
        }
    else:
        fields = {"info": "No specific fields identified"}
    return fields



In [42]:
### TOOL 5: DocumentChatbot
def chatbot_query(text):
    llm = ChatOpenAI(model_name="gpt-3.5-turbo")
    index = VectorstoreIndexCreator().from_texts([text])
    qa_chain = RetrievalQA.from_chain_type(llm, retriever=index.vectorstore.as_retriever())
    return qa_chain


In [43]:
### Define State
class DocumentState:
    def __init__(self, input_data=None, doc_type=None, text=None, classified_type=None, fields=None, query=None, response=None):
        self.input_data = input_data
        self.doc_type = doc_type
        self.text = text
        self.classified_type = classified_type
        self.fields = fields
        self.query = query
        self.response = response

### StateGraph Setup
workflow = StateGraph(DocumentState)

# Define Nodes
@workflow.node
def identify_type(state):
    state.doc_type = identify_document_type(state.input_data)
    return state

@workflow.node
def process(state):
    state.text = process_document(state.input_data, state.doc_type)
    return state

@workflow.node
def classify(state):
    state.classified_type = classify_document(state.text)
    return state

@workflow.node
def extract(state):
    state.fields = extract_fields(state.text, state.classified_type)
    return state

@workflow.node
def chat(state):
    if state.query:
        qa_chain = chatbot_query(state.text)
        state.response = qa_chain.run(state.query)
    return state

### Define Workflow Transitions
workflow.add_edge("identify_type", "process")
workflow.add_edge("process", "classify")
workflow.add_edge("classify", "extract")
workflow.add_edge("extract", "chat")
workflow.add_edge("chat", END)


AttributeError: 'StateGraph' object has no attribute 'node'

In [44]:
from typing import Annotated

from langchain_ollama import ChatOllama
from typing_extensions import TypedDict

from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages


class State(TypedDict):
    messages: Annotated[list, add_messages]


app = StateGraph(State)


llm = ChatOllama(model="llama3.2")

def chatbot(state: State):
    return {"messages": [llm.invoke(state["messages"])]}


app.add_node("chatbot", chatbot)

<langgraph.graph.state.StateGraph at 0x134c83ed0>

In [45]:
app = workflow.compile()

ValueError: Graph must have an entrypoint: add at least one edge from START to another node