# AI Agents - Decision based Document Summazization Technique
## Objectove
Build an agentic workflow to demonstrate how agents can be used to perform cost-effective document summarization of PDF content containing text and images utilizing multiple PDF summarization tools.
## Implementation
- Use AI Agent Framework (Autogen 0.4.*) to run multiple agents
- Use AI Agent to make decisions based on input PDF
- Summarize any given source PDF to MarkDown(.md) content
- Basic free PDF Summarization tool is Microsoft's latest open source MarkItDown library
- LLM PDF Summarization tool is Gemini 1.5 Flash

In [7]:
# Install pip dependenices
!pip install autogen-agentchat==0.4.5 --user
!pip install PyMuPDF --user

Collecting autogen-agentchat==0.4.5
  Using cached autogen_agentchat-0.4.5-py3-none-any.whl.metadata (2.5 kB)
Collecting autogen-core==0.4.5 (from autogen-agentchat==0.4.5)
  Using cached autogen_core-0.4.5-py3-none-any.whl.metadata (2.3 kB)
Collecting protobuf~=5.29.3 (from autogen-core==0.4.5->autogen-agentchat==0.4.5)
  Using cached protobuf-5.29.3-cp310-abi3-win_amd64.whl.metadata (592 bytes)
Using cached autogen_agentchat-0.4.5-py3-none-any.whl (64 kB)
Using cached autogen_core-0.4.5-py3-none-any.whl (78 kB)
Using cached protobuf-5.29.3-cp310-abi3-win_amd64.whl (434 kB)
Installing collected packages: protobuf, autogen-core, autogen-agentchat
  Attempting uninstall: protobuf
    Found existing installation: protobuf 4.25.6
    Uninstalling protobuf-4.25.6:
      Successfully uninstalled protobuf-4.25.6
  Attempting uninstall: autogen-core
    Found existing installation: autogen-core 0.4.2
    Uninstalling autogen-core-0.4.2:
      Successfully uninstalled autogen-core-0.4.2
  Atte

  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-ai-generativelanguage 0.6.6 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 5.29.3 which is incompatible.
paddlepaddle 2.6.2 requires protobuf<=3.20.2,>=3.1.0; platform_system == "Windows", but you have protobuf 5.29.3 which is incompatible.
streamlit 1.38.0 requires pillow<11,>=7.1.0, but you have pillow 11.1.0 which is incompatible.

[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
# configurations
pdf_path = "original/image_and_text_sample.pdf"
output_dir = "output/"
# Configure Google Cloud project
PROJECT_ID = "cryptic-skyline-411516"
REGION = "us-central1"

In [None]:
# Example documents for process
# pdf_path = "original/image_and_text_sample.pdf"           
# pdf_path = "original/text_sample.pdf"

In [2]:
# import modules
import os
import nest_asyncio
from typing import AsyncGenerator, List, Sequence, Tuple
import traceback
import asyncio
import fitz
from pathlib import Path
from autogen_ext.models.openai import OpenAIChatCompletionClient
from autogen_agentchat.agents import BaseChatAgent
from autogen_agentchat.base import Response
from autogen_agentchat.messages import AgentEvent, ChatMessage, TextMessage, StopMessage
from autogen_agentchat.conditions import TextMentionTermination, HandoffTermination, StopMessageTermination
from autogen_agentchat.teams import RoundRobinGroupChat, SelectorGroupChat
from autogen_agentchat.ui import Console
from autogen_core import CancellationToken
from markitdown import MarkItDown
import vertexai
from vertexai.generative_models import GenerativeModel, Part, Image

# Initialize Vertex AI client
vertexai.init(project=PROJECT_ID, location=REGION)

In [4]:
def get_filename_without_extension(file_path):
    return os.path.splitext(os.path.basename(file_path))[0]

# define agents
class UserProxyAgent(BaseChatAgent):
    def __init__(self, name: str) -> None:
        super().__init__(name, "A human user participating in the chat.")

    @property
    def produced_message_types(self) -> List[type[ChatMessage]]:
        return [TextMessage]

    async def on_messages(self, messages: Sequence[ChatMessage], cancellation_token: CancellationToken) -> Response:
        user_input = await asyncio.get_event_loop().run_in_executor(None, input, "Enter source PDF document Path:")
        print("input:",user_input)
        return Response(chat_message=TextMessage(content=user_input, source=self.name))

    async def on_reset(self, cancellation_token: CancellationToken) -> None:
        print("User Proxy Reset")

# Custom UserProxyAgent to interact with Gemini
class GeminiAgent(BaseChatAgent):
    def __init__(self, name: str, output_dir: str, mime_type : str = None):
        super().__init__(name=name, description="An agent that converts images to text.")
        self._output_dir = output_dir
        self._mime_type = mime_type

    @property
    def produced_message_types(self) -> Sequence[type[ChatMessage]]:
        return (TextMessage,)
        
    async def on_messages(self, messages: Sequence[ChatMessage], cancellation_token: CancellationToken) -> Response:
        # Call Gemini using the Google Cloud SDK
        if not messages:
            return Response(chat_message=StopMessage(content="No File Name received!", source=self.name))
        last_message = messages[-1].content.strip()
        filePath = last_message
        print("file:",filePath)
        response = await self.process(filePath=filePath, mime_type=self._mime_type)
        # Return the generated response from Gemini
        final_msg = TextMessage(content=f"created:{filePath}", source=self.name)
        return Response(chat_message=final_msg)

    async def on_reset(self, cancellation_token: CancellationToken) -> None:
        pass
    
    async def process(self, filePath: str, mime_type: str = None) -> str:
        try:
            model = GenerativeModel("gemini-1.5-flash-002")
            if(mime_type == "application/pdf"):
                print("summarise pdf...")            
                encoded=self.encode_pdf(filePath)
                #print("encoded", base64.b64decode(encoded))
                attachment_file = Part.from_data( mime_type="application/pdf",
                                        data=encoded)
                prompt = "Can you summarise content including images from the attached pdf in markdown?"
            else:
                print("summarise image...")  
                attachment_file = Part.from_image(Image.load_from_file(filePath))
                prompt = "Describe this image?"
            # Query the model
            print("total_tokens: ", model.count_tokens([attachment_file, prompt]))
            response = model.generate_content([attachment_file, prompt])
            print("usage_metadata: ", response.usage_metadata)
            filename = os.path.join(self._output_dir, get_filename_without_extension(filePath))+"_gemini.md"
            with open(filename, "w", encoding="utf-8") as file:
                file.write(response.text)
            return f"created:{filename}"
        except Exception as e:
            print("Exception : ", traceback.format_exc())
            return f"Error calling Vertex AI: {str(e)}"
        
    def encode_pdf(self, pdf_path):
        return Path(pdf_path).read_bytes()

class MarkItDownAgent(BaseChatAgent):
    def __init__(self, name, output_dir: str):
        super().__init__(name, "An agent to convert to Mark Down output.")
        self._output_dir = output_dir

    @property
    def produced_message_types(self) -> Sequence[type[ChatMessage]]:
        return (TextMessage,)

    async def on_messages(self, messages: Sequence[ChatMessage], cancellation_token: CancellationToken) -> Response:
        if not messages:
            return Response(chat_message=StopMessage(content="No File Name received!", source=self.name))
        last_message = messages[-1].content.strip()
        filePath = last_message
        return await self.process(filePath)

    async def process(self, filePath: str) -> Response:
        inner_messages: List[ChatMessage] = []
        md = MarkItDown()
        inner_messages.append(TextMessage(content=f'Will Summarise Free option using MarkDown...', source=self.name))
        result = md.convert(filePath)
        filename = os.path.join(self._output_dir, get_filename_without_extension(filePath))+"_markitdown.md"
        print("content:",result.text_content)
        with open(filename, "w", encoding="utf-8") as file:
            file.write(result.text_content)
        #print(result.text_content)
        final_msg = TextMessage(content=f"created:{filename}", source=self.name)
        inner_messages.append(final_msg)
        return Response(chat_message=final_msg)

    async def on_reset(self, cancellation_token: CancellationToken) -> None:
        pass

class DocumentTriageJobAgent(BaseChatAgent):
    def __init__(self, name: str):
        super().__init__(name, "An agent to decide on how which agent to use for processing document.")

    @property
    def produced_message_types(self) -> Sequence[type[ChatMessage]]:
        return (TextMessage,)

    async def on_messages(self, messages: Sequence[ChatMessage], cancellation_token: CancellationToken) -> Response:
        if not messages:
            return Response(chat_message=StopMessage(content="No File Name received!", source=self.name))
        last_message = messages[-1].content.strip()
        filePath = last_message
        return await self.process(filePath)

    async def process(self, filePath: str) -> Response:
        inner_messages: List[ChatMessage] = []
        if not(os.path.isfile(filePath)):
            return Response(chat_message=StopMessage(content="File path provided isn't a valid file!", source=self.name), inner_messages=inner_messages)
        classifier_result = self.classifier(filePath)
        inner_messages.append(TextMessage(content=f'pdf classifier_result: {classifier_result}', source=self.name))
        print("document type:",classifier_result)
        if(classifier_result=="TEXT"):
            agent = MarkItDownAgent(name="markdown_agent", output_dir=output_dir)
        elif (classifier_result=="IMAGE"):
            agent = GeminiAgent(name="llm_agent", output_dir=output_dir, mime_type="application/pdf")
        await Console(
                agent.on_messages_stream(
                    [TextMessage(content=filePath, source="user")], CancellationToken()
                )
            )
        #print("final:",inner_messages)
        final_msg = TextMessage(content="Done!", source=self.name)
        return Response(chat_message=final_msg, inner_messages=inner_messages)

    async def on_reset(self, cancellation_token: CancellationToken) -> None:
        pass

    def classifier(self, pdf_file):
        with open(pdf_file,"rb") as f:
            pdf = fitz.open(f)
            res = []
            for page in pdf:
                img_refs = page.get_image_info(xrefs=True)
                if img_refs != []:
                    #print("Page", page.number, "images:", [i["xref"] for i in img_refs])
                    res.append([i["xref"] for i in img_refs])
            if  len(res) == 0:
                return("TEXT")
            else:
                return("IMAGE")
# debug run
# Main function to run the agent
# async def debug_agent(input_path: str):
#     agent = DocumentTriageJobAgent(name="document_triage_agent")
#     # User's input message
#     user_message = TextMessage(content=input_path, source="user")
#     # Send the message and get the response from Gemini
#     response = await agent.on_messages(messages=[user_message],
#         cancellation_token=CancellationToken(),
#     )
    
#     print(f"response: {response.chat_message}")

# #Run the async function
# await debug_agent("original/image_and_text_sample.pdf")

In [6]:
# initiate agents
async def main():
    # Initialize agents
    user_proxy_agent = UserProxyAgent(name="user_proxy_agent")
    document_triage_agent = DocumentTriageJobAgent(name="document_triage_agent")
    markitdown_agent = MarkItDownAgent(name="markdown_agent", output_dir=output_dir)
    llm_agent = GeminiAgent(name="llm_agent", output_dir=output_dir, mime_type="application/pdf")
    # Define termination conditions
    termination = TextMentionTermination("Done!") | HandoffTermination(user_proxy_agent) | StopMessageTermination()
    # Create a chat group
    group_chat = RoundRobinGroupChat(
        participants=[user_proxy_agent, document_triage_agent],
        termination_condition=termination
    )

    group_chat = SelectorGroupChat(
        [markitdown_agent, llm_agent],
        model_client=model_client,
        selector_func=selector_func,
        termination_condition=termination,
        max_turns=3
    )

    # Programmatically provide the initial message
    initial_message = TextMessage(content="Enter Source file Name:", source="user")
    # Start the chat
    stream = group_chat.run_stream(cancellation_token=CancellationToken())
    # Await the stream to process messages
    await Console(stream)
nest_asyncio.apply()
asyncio.run(main())

[]
---------- markdown_agent ----------
No File Name received!
