In [1]:
#load visual language model subandhu
# write code for reading document and converting the pages to image aditya

In [6]:
import fitz  # PyMuPDF
import os
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain_core.tools import BaseTool
from langchain_core.agents import BaseAgent
from langchain_core.messages import HumanMessage
from langgraph.graph import StateGraph, END
from typing import TypedDict, Annotated, Sequence
import operator
import pandas as pd
import json

# Initialize the model and tokenizer
model_id = "vikhyatk/moondream2"
revision = "2024-05-20"
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, revision=revision)
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)

# Step 1: Convert PDF to Images
def convert_pdf_to_images(pdf_path, output_folder):
    os.makedirs(output_folder, exist_ok=True)
    pdf_document = fitz.open(pdf_path)
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        pix = page.get_pixmap()
        image_path = os.path.join(output_folder, f"page_{page_num + 1}.png")
        pix.save(image_path)
    return [os.path.join(output_folder, f"page_{i + 1}.png") for i in range(len(pdf_document))]

# Step 2: Define Tools
class AadhaarTool(BaseTool):
    name = "aadhaar_extraction"
    description = "Extracts address information from Aadhaar card images."

    def __call__(self, image_path: str) -> dict:
        image = Image.open(image_path)
        enc_image = model.encode_image(image)
        result = model.answer_question(enc_image, "Extract address from this Aadhaar card.", tokenizer)
        return {"address": result}

class PanTool(BaseTool):
    name = "pan_extraction"
    description = "Extracts identity information from PAN card images."

    def __call__(self, image_path: str) -> dict:
        image = Image.open(image_path)
        enc_image = model.encode_image(image)
        result = model.answer_question(enc_image, "Extract name and PAN number from this PAN card.", tokenizer)
        return {"identity_info": result}

class VoterIDTool(BaseTool):
    name = "voterid_extraction"
    description = "Extracts identity information from Voter ID images."

    def __call__(self, image_path: str) -> dict:
        image = Image.open(image_path)
        enc_image = model.encode_image(image)
        result = model.answer_question(enc_image, "Extract name and Voter ID from this Voter ID card.", tokenizer)
        return {"identity_info": result}

class AttributeTool(BaseTool):
    name = "attribute_extraction"
    description = "Extracts specified attributes from the document images based on the prompt."

    def __call__(self, image_path: str, attributes: list) -> dict:
        image = Image.open(image_path)
        enc_image = model.encode_image(image)
        results = {}
        for attr in attributes:
            result = model.answer_question(enc_image, f"Extract {attr} from this document.", tokenizer)
            results[attr] = result
        return results

# Step 3: Define Agents
class ProofOfAddressAgent(BaseAgent):
    def __init__(self, tool):
        self.tool = tool

    def __call__(self, state):
        image_path = state["image_path"]
        address_info = self.tool(image_path)
        return {"address_info": address_info}

class ProofOfIdentityAgent(BaseAgent):
    def __init__(self, tools):
        self.tools = tools

    def __call__(self, state):
        image_path = state["image_path"]
        for tool in self.tools:
            identity_info = tool(image_path)
            if identity_info:
                return {"identity_info": identity_info}
        return {"identity_info": None}

class AttributeExtractionAgent(BaseAgent):
    def __init__(self, tool):
        self.tool = tool

    def __call__(self, state):
        image_path = state["image_path"]
        attributes = state["attributes"]
        extracted_attributes = self.tool(image_path, attributes)
        return {"extracted_attributes": extracted_attributes}

class ReviewerAgent(BaseAgent):
    def __call__(self, state):
        address_info = state.get("address_info")
        identity_info = state.get("identity_info")
        extracted_attributes = state.get("extracted_attributes")

        if address_info and identity_info and extracted_attributes:
            data = {**address_info, **identity_info, **extracted_attributes}
            df = pd.DataFrame([data])
            return {"dataframe": df}
        else:
            return {"error": "Missing information"}

# Step 4: Integrate Agents into LangGraph
class AgentState(TypedDict):
    image_path: str
    attributes: list
    address_info: dict
    identity_info: dict
    extracted_attributes: dict

def call_proof_of_address(state):
    agent = ProofOfAddressAgent(AadhaarTool())
    return agent(state)

def call_proof_of_identity(state):
    agent = ProofOfIdentityAgent([PanTool(), VoterIDTool()])
    return agent(state)

def call_attribute_extraction(state):
    agent = AttributeExtractionAgent(AttributeTool())
    return agent(state)

def call_reviewer(state):
    agent = ReviewerAgent()
    return agent(state)

workflow = StateGraph(AgentState)
workflow.add_node("proof_of_address", call_proof_of_address)
workflow.add_node("proof_of_identity", call_proof_of_identity)
workflow.add_node("attribute_extraction", call_attribute_extraction)
workflow.add_node("reviewer", call_reviewer)
workflow.set_entry_point("proof_of_address")

workflow.add_edge("proof_of_address", "proof_of_identity")
workflow.add_edge("proof_of_identity", "attribute_extraction")
workflow.add_edge("attribute_extraction", "reviewer")
workflow.add_edge("reviewer", END)

app = workflow.compile()

# Step 5: Process Each Page of the Document
pdf_path = r"Docs\BGQPK4512E_31012024103628.pdf" 
output_folder = "output_images"
image_paths = convert_pdf_to_images(pdf_path, output_folder)
attributes_to_extract = ["DOB", "Issue Date"]

all_dataframes = []
for image_path in image_paths:
    inputs = {
        "image_path": image_path,
        "attributes": attributes_to_extract
    }
    output = app.invoke(inputs)
    if "dataframe" in output:
        all_dataframes.append(output["dataframe"])

# Combine all dataframes
final_dataframe = pd.concat(all_dataframes, ignore_index=True)
print(final_dataframe)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


NameError: name 'BaseAgent' is not defined

In [6]:
import fitz  # PyMuPDF
from PIL import Image
import os
from langgraph.graph import StateGraph, END
from typing import TypedDict, List, Annotated
import operator

class AgentState(TypedDict):
    pdf_path: str
    output_folder: str
    storage_folder: str
    image_paths: Annotated[List[str], operator.add]

def convert_pdf_to_images(pdf_path, output_folder):
    pdf_document = fitz.open(pdf_path)
    image_paths = []
    
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        pix = page.get_pixmap()
        output_path = os.path.join(output_folder, f'page_{page_num + 1}.png')
        pix.save(output_path)
        image_paths.append(output_path)
    
    return image_paths

def store_images(image_paths, storage_folder):
    if not os.path.exists(storage_folder):
        os.makedirs(storage_folder)
    
    for image_path in image_paths:
        image = Image.open(image_path)
        image.save(os.path.join(storage_folder, os.path.basename(image_path)))
    
    return {"status": "success", "stored_images": len(image_paths)}

def call_convert_pdf_to_images(state):
    pdf_path = state['pdf_path']
    output_folder = state['output_folder']
    image_paths = convert_pdf_to_images(pdf_path, output_folder)
    return {"image_paths": image_paths}

def call_store_images(state):
    image_paths = state['image_paths']
    storage_folder = state['storage_folder']
    result = store_images(image_paths, storage_folder)
    return result

workflow = StateGraph(AgentState)

workflow.add_node("convert_pdf_to_images", call_convert_pdf_to_images)
workflow.add_node("store_images", call_store_images)

workflow.set_entry_point("convert_pdf_to_images")
workflow.add_edge("convert_pdf_to_images", "store_images")
workflow.set_finish_point("store_images")

app = workflow.compile()

inputs = {
    "pdf_path": r"Docs\BGQPK4512E_31012024103628.pdf",
    "output_folder": r"output_images",
    "storage_folder": r"storage",
    "image_paths": []
}

result = app.invoke(inputs)
print(result)


{'pdf_path': 'C:\\Users\\KIIT\\Desktop\\who\\work\\skive.ai\\BGQPK4512E_31012024103628.pdf', 'output_folder': 'C:\\Users\\KIIT\\Desktop\\who\\work\\skive.ai', 'storage_folder': 'C:\\Users\\KIIT\\Desktop\\who\\work\\skive.ai', 'image_paths': ['C:\\Users\\KIIT\\Desktop\\who\\work\\skive.ai\\page_1.png', 'C:\\Users\\KIIT\\Desktop\\who\\work\\skive.ai\\page_2.png', 'C:\\Users\\KIIT\\Desktop\\who\\work\\skive.ai\\page_3.png', 'C:\\Users\\KIIT\\Desktop\\who\\work\\skive.ai\\page_4.png', 'C:\\Users\\KIIT\\Desktop\\who\\work\\skive.ai\\page_5.png', 'C:\\Users\\KIIT\\Desktop\\who\\work\\skive.ai\\page_6.png', 'C:\\Users\\KIIT\\Desktop\\who\\work\\skive.ai\\page_7.png', 'C:\\Users\\KIIT\\Desktop\\who\\work\\skive.ai\\page_8.png']}


In [11]:
import fitz  # PyMuPDF
from PIL import Image
import os
from typing import TypedDict, List, Annotated, Dict
import operator
from transformers import AutoModelForCausalLM, AutoTokenizer

# Ensure required directories exist
def ensure_directory_exists(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)

# Convert PDF pages to images
def convert_pdf_to_images(pdf_path, output_folder):
    ensure_directory_exists(output_folder)
    pdf_document = fitz.open(pdf_path)
    image_paths = []
    
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        pix = page.get_pixmap()
        output_path = os.path.join(output_folder, f'page_{page_num + 1}.png')
        pix.save(output_path)
        image_paths.append(output_path)
    
    return image_paths

# Store images in the specified folder
def store_images(image_paths, storage_folder):
    ensure_directory_exists(storage_folder)
    
    for image_path in image_paths:
        image = Image.open(image_path)
        image.save(os.path.join(storage_folder, os.path.basename(image_path)))
    
    return {"status": "success", "stored_images": len(image_paths)}

# Reviewer agent: Extract information from each page based on user query
def review_images(image_paths, query):
    model_id = "vikhyatk/moondream2"
    revision = "2024-07-23"
    model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, revision=revision)
    tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
    
    results = {}
    for image_path in image_paths:
        image = Image.open(image_path)
        enc_image = model.encode_image(image)
        response = model.answer_question(enc_image, query, tokenizer)
        results[image_path] = response
    
    return results

class AgentState(TypedDict):
    pdf_path: str
    output_folder: str
    storage_folder: str
    query: str
    image_paths: Annotated[List[str], operator.add]
    review_results: Dict[str, str]

def call_convert_pdf_to_images(state):
    pdf_path = state['pdf_path']
    output_folder = state['output_folder']
    image_paths = convert_pdf_to_images(pdf_path, output_folder)
    return {"image_paths": image_paths}

def call_store_images(state):
    image_paths = state['image_paths']
    storage_folder = state['storage_folder']
    result = store_images(image_paths, storage_folder)
    return result

def call_review_images(state):
    image_paths = state['image_paths']
    query = state['query']
    review_results = review_images(image_paths, query)
    return {"image_paths": image_paths}

from langgraph.graph import StateGraph, END

workflow = StateGraph(AgentState)

workflow.add_node("convert_pdf_to_images", call_convert_pdf_to_images)
workflow.add_node("store_images", call_store_images)
workflow.add_node("review_images", call_review_images)

workflow.set_entry_point("convert_pdf_to_images")
workflow.add_edge("convert_pdf_to_images", "store_images")
workflow.add_edge("store_images", "review_images")
workflow.set_finish_point("review_images")

app = workflow.compile()
prompt=
inputs = {
    "pdf_path": r"Docs\BGQPK4512E_31012024103628.pdf",
    "output_folder": r"output_images",
    "storage_folder": r"output_images",
    "query": "Describe the main topic of this page.",
    "image_paths": [],
    "review_results": {}
}

result = app.invoke(inputs)
print(result)

SyntaxError: invalid syntax (3467332613.py, line 94)

In [14]:
import fitz  # PyMuPDF
from PIL import Image
import os
import pandas as pd
from typing import TypedDict, List, Annotated, Dict
import operator
from transformers import AutoModelForCausalLM, AutoTokenizer

# Ensure required directories exist
def ensure_directory_exists(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)

# Convert PDF pages to images
def convert_pdf_to_images(pdf_path, output_folder):
    ensure_directory_exists(output_folder)
    pdf_document = fitz.open(pdf_path)
    image_paths = []
    
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        pix = page.get_pixmap()
        output_path = os.path.join(output_folder, f'page_{page_num + 1}.png')
        pix.save(output_path)
        image_paths.append(output_path)
    
    return image_paths

# Store images in the specified folder
def store_images(image_paths, storage_folder):
    ensure_directory_exists(storage_folder)
    
    for image_path in image_paths:
        image = Image.open(image_path)
        image.save(os.path.join(storage_folder, os.path.basename(image_path)))
    
    return {"status": "success", "stored_images": len(image_paths)}

# Reviewer agent: Extract information from each page based on user query
def review_images(image_paths, query):
    model_id = "vikhyatk/moondream2"
    revision = "2024-07-23"
    model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, revision=revision)
    tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
    
    results = {}
    for image_path in image_paths:
        image = Image.open(image_path)
        enc_image = model.encode_image(image)
        response = model.answer_question(enc_image, query, tokenizer)
        results[image_path] = response
    
    return results

# Generate the detailed query
def generate_query():
    attributes = [
        "PAN Copy Present or not",
        "PAN Number",
        "Name",
        "Gender",
        "POA Copy Present or not",
        "Type - UID, voter ID, Passport, Driver Licence",
        "Name",
        "Proof Number",
        "Download date",
        "Gender",
        "c/o",
        "Address [Pincode, State, City, locality, Landmark]",
        "Signature present or not",
        "Document",
        "KYC Mode",
        "Applicant Type",
        "Name",
        "Maiden Name",
        "Father’s Name",
        "DOB",
        "Gender",
        "Occupation",
        "Residential status",
        "Application Number",
        "Marital Status",
        "PAN Number",
        "Nationality",
        "Citizenship",
        "Address type",
        "Proof of address : Type",
        "Residential Address {Line 123, state, country, pincode, city}",
        "Email",
        "Mobile",
        "Corresponding address",
        "Name & E code",
        "Designation",
        "Date",
        "Intermediary Date",
        "Esign Date"
    ]
    
    query = "Please check if the following attributes are present in this document and extract the information: " + ", ".join(attributes)
    return query

class AgentState(TypedDict):
    pdf_path: str
    output_folder: str
    storage_folder: str
    query: str
    image_paths: Annotated[List[str], operator.add]
    review_results: Dict[str, str]

def call_convert_pdf_to_images(state):
    pdf_path = state['pdf_path']
    output_folder = state['output_folder']
    image_paths = convert_pdf_to_images(pdf_path, output_folder)
    return {"image_paths": image_paths}

def call_store_images(state):
    image_paths = state['image_paths']
    storage_folder = state['storage_folder']
    result = store_images(image_paths, storage_folder)
    return result

def call_review_images(state):
    image_paths = state['image_paths']
    query = state['query']
    review_results = review_images(image_paths, query)
    return {"image_paths": image_paths}

from langgraph.graph import StateGraph, END

workflow = StateGraph(AgentState)

workflow.add_node("convert_pdf_to_images", call_convert_pdf_to_images)
workflow.add_node("store_images", call_store_images)
workflow.add_node("review_images", call_review_images)

workflow.set_entry_point("convert_pdf_to_images")
workflow.add_edge("convert_pdf_to_images", "store_images")
workflow.add_edge("store_images", "review_images")
workflow.set_finish_point("review_images")

app = workflow.compile()

inputs = {
    "pdf_path": r"Docs\BGQPK4512E_31012024103628.pdf",
    "output_folder": r"output_images",
    "storage_folder": r"output_images",
    "query": generate_query(),
    "image_paths": [],
    "review_results": {}
}

result = app.invoke(inputs)

# Save the review results to a DataFrame
review_results = result["review_results"]
df = pd.DataFrame.from_dict(review_results, orient='index')
df.reset_index(inplace=True)
df.rename(columns={'index': 'Image Path', 0: 'Extracted Information'}, inplace=True)

# Save DataFrame to a CSV file
df.to_csv('/path/to/output/folder/extracted_information.csv', index=False)

print("Extraction complete. Results saved to extracted_information.csv")

PermissionError: [WinError 5] Access is denied: 'C:\\Users\\KIIT'

In [17]:
import fitz  # PyMuPDF
from PIL import Image
import os
import pandas as pd
import logging
from typing import TypedDict, List, Annotated, Dict
import operator
from transformers import AutoModelForCausalLM, AutoTokenizer

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Ensure required directories exist
def ensure_directory_exists(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)
        logging.info(f"Directory created: {directory}")
    else:
        logging.info(f"Directory already exists: {directory}")

# Convert PDF pages to images
def convert_pdf_to_images(pdf_path, output_folder):
    ensure_directory_exists(output_folder)
    pdf_document = fitz.open(pdf_path)
    image_paths = []
    
    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        pix = page.get_pixmap()
        output_path = os.path.join(output_folder, f'page_{page_num + 1}.png')
        pix.save(output_path)
        image_paths.append(output_path)
        logging.info(f"Page {page_num + 1} converted to image: {output_path}")
    
    return image_paths

# Store images in the specified folder
def store_images(image_paths, storage_folder):
    ensure_directory_exists(storage_folder)
    
    for image_path in image_paths:
        image = Image.open(image_path)
        image.save(os.path.join(storage_folder, os.path.basename(image_path)))
        logging.info(f"Image stored: {image_path}")
    
    return {"status": "success", "stored_images": len(image_paths)}

# Reviewer agent: Extract information from each page based on user query
def review_images(image_paths, query):
    model_id = "vikhyatk/moondream2"
    revision = "2024-07-23"
    model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, revision=revision)
    tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
    
    results = {}
    for image_path in image_paths:
        logging.info(f"Reviewing image: {image_path}")
        image = Image.open(image_path)
        enc_image = model.encode_image(image)
        response = model.answer_question(enc_image, query, tokenizer)
        results[image_path] = response
        logging.info(f"Extracted information from {image_path}: {response}")
    
    return results

# Generate the detailed query
def generate_query():
    attributes = [
        "PAN Copy Present or not",
        "PAN Number",
        "Name",
        "Gender",
        "POA Copy Present or not",
        "Type - UID, voter ID, Passport, Driver Licence",
        "Name",
        "Proof Number",
        "Download date",
        "Gender",
        "c/o",
        "Address [Pincode, State, City, locality, Landmark]",
        "Signature present or not",
        "Document",
        "KYC Mode",
        "Applicant Type",
        "Name",
        "Maiden Name",
        "Father’s Name",
        "DOB",
        "Gender",
        "Occupation",
        "Residential status",
        "Application Number",
        "Marital Status",
        "PAN Number",
        "Nationality",
        "Citizenship",
        "Address type",
        "Proof of address : Type",
        "Residential Address {Line 123, state, country, pincode, city}",
        "Email",
        "Mobile",
        "Corresponding address",
        "Name & E code",
        "Designation",
        "Date",
        "Intermediary Date",
        "Esign Date"
    ]
    
    query = "Please check if the following attributes are present in this document and extract the information: " + ", ".join(attributes)
    return query

class AgentState(TypedDict):
    pdf_path: str
    output_folder: str
    storage_folder: str
    query: str
    image_paths: Annotated[List[str], operator.add]
    review_results: Dict[str, str]

def call_convert_pdf_to_images(state):
    pdf_path = state['pdf_path']
    output_folder = state['output_folder']
    logging.info(f"Starting PDF to image conversion for {pdf_path}")
    image_paths = convert_pdf_to_images(pdf_path, output_folder)
    logging.info(f"PDF to image conversion completed. Images saved in {output_folder}")
    return {"image_paths": image_paths}

def call_store_images(state):
    image_paths = state['image_paths']
    storage_folder = state['storage_folder']
    logging.info(f"Storing images in {storage_folder}")
    result = store_images(image_paths, storage_folder)
    logging.info(f"Image storage completed. Stored {result['stored_images']} images")
    return result

def call_review_images(state):
    image_paths = state['image_paths']
    query = state['query']
    logging.info("Starting image review with the query: " + query)
    review_results = review_images(image_paths, query)
    logging.info("Image review completed")
    return {"review_results": review_results}

from langgraph.graph import StateGraph, END

workflow = StateGraph(AgentState)

workflow.add_node("convert_pdf_to_images", call_convert_pdf_to_images)
workflow.add_node("store_images", call_store_images)
workflow.add_node("review_images", call_review_images)

workflow.set_entry_point("convert_pdf_to_images")
workflow.add_edge("convert_pdf_to_images", "store_images")
workflow.add_edge("store_images", "review_images")
workflow.set_finish_point("review_images")

app = workflow.compile()

inputs = {
    "pdf_path": r"Docs\BGQPK4512E_31012024103628.pdf",
    "output_folder": r"output_images",
    "storage_folder": r"output_images",
    "query": generate_query(),
    "image_paths": [],
    "review_results": {}
}

logging.info("Workflow execution started")
result = app.invoke(inputs)
logging.info("Workflow execution completed")

# Save the review results to a DataFrame
review_results = result["review_results"]
df = pd.DataFrame.from_dict(review_results, orient='index')
df.reset_index(inplace=True)
df.rename(columns={'index': 'Image Path', 0: 'Extracted Information'}, inplace=True)

# Save DataFrame to a CSV file
output_csv_path = os.path.join(inputs['output_folder'], 'extracted_information.csv')
df.to_csv(output_csv_path, index=False)
logging.info(f"Extraction complete. Results saved to {output_csv_path}")

print("Extraction complete. Results saved to extracted_information.csv")

2024-07-25 17:26:10,898 - INFO - Workflow execution started
2024-07-25 17:26:10,902 - INFO - Starting PDF to image conversion for Docs\BGQPK4512E_31012024103628.pdf
2024-07-25 17:26:10,904 - INFO - Directory already exists: output_images
2024-07-25 17:26:10,932 - INFO - Page 1 converted to image: output_images\page_1.png
2024-07-25 17:26:10,954 - INFO - Page 2 converted to image: output_images\page_2.png
2024-07-25 17:26:10,972 - INFO - Page 3 converted to image: output_images\page_3.png
2024-07-25 17:26:10,995 - INFO - Page 4 converted to image: output_images\page_4.png
2024-07-25 17:26:11,028 - INFO - Page 5 converted to image: output_images\page_5.png
2024-07-25 17:26:11,059 - INFO - Page 6 converted to image: output_images\page_6.png
2024-07-25 17:26:11,079 - INFO - Page 7 converted to image: output_images\page_7.png
2024-07-25 17:26:11,098 - INFO - Page 8 converted to image: output_images\page_8.png
2024-07-25 17:26:11,099 - INFO - PDF to image conversion completed. Images saved i

InvalidUpdateError: Must write to at least one of ['pdf_path', 'output_folder', 'storage_folder', 'query', 'image_paths', 'review_results']