In [None]:
# Install pip dependenices
!pip install autogen-agentchat==0.4.2 --user
!pip install PyMuPDF --user

In [None]:
# configurations
# Sample input PDF. In the code visible in this notebook it is only referenced by
# commented-out debug snippets; the interactive run asks the user for a path instead.
pdf_path = "original/image_and_text_sample.pdf"
# Directory the generated Markdown files are written to. Read as a module-level
# global by DocumentTriageJobAgent.process when it constructs the worker agent.
output_dir = "output/"
# Configure Google Cloud project
# Both values are passed to vertexai.init(...) in the imports cell.
PROJECT_ID = "cryptic-skyline-411516"
REGION = "us-central1"

In [234]:
import os
import nest_asyncio
from typing import AsyncGenerator, List, Sequence, Tuple
import traceback
import asyncio
import fitz
from pathlib import Path
from autogen_agentchat.agents import BaseChatAgent
from autogen_agentchat.base import Response
from autogen_agentchat.messages import AgentEvent, ChatMessage, TextMessage, StopMessage
from autogen_agentchat.conditions import TextMentionTermination, HandoffTermination, StopMessageTermination
from autogen_agentchat.teams import RoundRobinGroupChat
from autogen_agentchat.ui import Console
from autogen_core import CancellationToken
from markitdown import MarkItDown
import vertexai
from vertexai.generative_models import GenerativeModel, Part, Image

# Initialize Vertex AI client
# Relies on PROJECT_ID and REGION from the configuration cell, so that cell must run first.
vertexai.init(project=PROJECT_ID, location=REGION)

In [251]:
def get_filename_without_extension(file_path):
    """Return the last path component of ``file_path`` with its final extension removed.

    Only the last extension is dropped (``"a.tar.gz"`` -> ``"a.tar"``); a path that
    ends in a separator has an empty last component and therefore yields ``""``.
    """
    _directory, leaf = os.path.split(file_path)
    stem, _extension = os.path.splitext(leaf)
    return stem

class UserProxyAgent(BaseChatAgent):
    """Chat participant that relays a line typed by the human operator.

    On every turn the agent asks for a PDF path on stdin and returns whatever
    was typed, unmodified, as a ``TextMessage``.
    """

    def __init__(self, name: str) -> None:
        super().__init__(name, "A human user participating in the chat.")

    @property
    def produced_message_types(self) -> List[type[ChatMessage]]:
        """Message classes this agent emits (only ``TextMessage``)."""
        return [TextMessage]

    async def on_messages(self, messages: Sequence[ChatMessage], cancellation_token: CancellationToken) -> Response:
        """Prompt the user for a path and wrap the answer in a ``Response``.

        ``messages`` and ``cancellation_token`` are not inspected. ``input`` is
        run in the default executor so the blocking read does not stall the
        event loop.
        """
        loop = asyncio.get_event_loop()
        typed_path = await loop.run_in_executor(None, input, "Enter source PDF document Path:")
        print("input:", typed_path)
        reply = TextMessage(content=typed_path, source=self.name)
        return Response(chat_message=reply)

    async def on_reset(self, cancellation_token: CancellationToken) -> None:
        """Nothing to reset; just log that a reset was requested."""
        print("User Proxy Reset")

# Custom agent that sends a PDF or image to Gemini on Vertex AI and stores the answer
class GeminiAgent(BaseChatAgent):
    """Agent that asks Gemini to summarise a PDF (or describe an image) as Markdown.

    The last incoming message is interpreted as a file path. The model's text
    answer is written to ``<output_dir>/<input stem>_gemini.md``.

    Args:
        name: Agent name.
        output_dir: Directory the ``*_gemini.md`` file is written to.
        mime_type: ``"application/pdf"`` to send the file as a PDF attachment;
            any other value makes the agent load the file as an image.
    """

    def __init__(self, name: str, output_dir: str, mime_type: str = None):
        super().__init__(name=name, description="An agent that converts images to text.")
        self._output_dir = output_dir
        self._mime_type = mime_type

    @property
    def produced_message_types(self) -> Sequence[type[ChatMessage]]:
        """Message classes this agent emits (only ``TextMessage``)."""
        return (TextMessage,)

    async def on_messages(self, messages: Sequence[ChatMessage], cancellation_token: CancellationToken) -> Response:
        """Process the file named by the last message and report the outcome.

        Returns a ``TextMessage`` whose content is exactly what ``process``
        returned: ``created:<output file>`` on success, or the error text on
        failure. (Previously the result was discarded and the *input* path was
        always reported as created, even when the Gemini call had failed.)
        """
        if not messages:
            return Response(chat_message=TextMessage(content="No File Name received!", source=self.name))
        filePath = messages[-1].content.strip()
        print("file:",filePath)
        outcome = await self.process(filePath=filePath, mime_type=self._mime_type)
        return Response(chat_message=TextMessage(content=outcome, source=self.name))

    async def on_reset(self, cancellation_token: CancellationToken) -> None:
        """No per-conversation state to clear."""
        pass

    # Function to call Vertex AI Gemini for image-to-text summarization
    async def process(self, filePath: str, mime_type: str = None) -> str:
        """Send ``filePath`` to Gemini and write the response text to disk.

        Args:
            filePath: Path of the PDF or image to send.
            mime_type: ``"application/pdf"`` selects the PDF branch; anything
                else selects the image branch.

        Returns:
            ``"created:<output file>"`` on success, otherwise
            ``"Error calling Vertex AI: <message>"``. Exceptions are caught,
            their traceback is printed, and they are never re-raised.
        """
        try:
            # Make sure the destination exists *before* spending a model call,
            # otherwise open() below fails after the request was already made.
            if self._output_dir:
                os.makedirs(self._output_dir, exist_ok=True)
            model = GenerativeModel("gemini-1.5-flash-002")
            if mime_type == "application/pdf":
                print("summarise pdf...")
                encoded = self.encode_pdf(filePath)
                attachment_file = Part.from_data(mime_type="application/pdf", data=encoded)
                prompt = "Can you summarise content including images from the attached pdf in markdown?"
            else:
                print("summarise image...")
                attachment_file = Part.from_image(Image.load_from_file(filePath))
                prompt = "Describe this image?"
            # Query the model
            print("total_tokens: ", model.count_tokens([attachment_file, prompt]))
            response = model.generate_content([attachment_file, prompt])
            print("usage_metadata: ", response.usage_metadata)
            filename = os.path.join(self._output_dir, get_filename_without_extension(filePath))+"_gemini.md"
            with open(filename, "w", encoding="utf-8") as file:
                file.write(response.text)
            return f"created:{filename}"
        except Exception as e:
            print("Exception : ", traceback.format_exc())
            return f"Error calling Vertex AI: {str(e)}"

    def encode_pdf(self, pdf_path):
        """Return the raw bytes of the file at ``pdf_path``."""
        return Path(pdf_path).read_bytes()

class MarkItDownAgent(BaseChatAgent):
    """Agent that converts a document to Markdown locally with MarkItDown.

    The last incoming message is interpreted as a file path. The converted text
    is written to ``<output_dir>/<input stem>_markitdown.md``.

    Args:
        name: Agent name.
        output_dir: Directory the ``*_markitdown.md`` file is written to.
    """

    def __init__(self, name, output_dir: str):
        super().__init__(name, "An agent to convert to Mark Down output.")
        self._output_dir = output_dir

    @property
    def produced_message_types(self) -> Sequence[type[ChatMessage]]:
        """Message classes this agent emits (only ``TextMessage``)."""
        return (TextMessage,)

    async def on_messages(self, messages: Sequence[ChatMessage], cancellation_token: CancellationToken) -> Response:
        """Convert the file named by the last message.

        An empty ``messages`` sequence yields a "No File Name received!" reply,
        matching the other agents in this notebook, instead of an IndexError.
        """
        if not messages:
            return Response(chat_message=TextMessage(content="No File Name received!", source=self.name))
        filePath = messages[-1].content.strip()
        return await self.process(filePath)

    async def process(self, filePath: str) -> Response:
        """Convert ``filePath`` to Markdown and write it to the output directory.

        Returns:
            A ``Response`` whose chat message is ``created:<output file>`` and
            whose ``inner_messages`` carry the progress notice. (The notice was
            previously collected but never attached to the response.)

        Errors raised by MarkItDown or by the file write propagate to the caller.
        """
        inner_messages: List[ChatMessage] = [
            TextMessage(content='Will Summarise Free option using MarkDown...', source=self.name)
        ]
        result = MarkItDown().convert(filePath)
        # open(..., "w") does not create missing directories, so do it here.
        if self._output_dir:
            os.makedirs(self._output_dir, exist_ok=True)
        filename = os.path.join(self._output_dir, get_filename_without_extension(filePath))+"_markitdown.md"
        print("content:",result.text_content)
        with open(filename, "w", encoding="utf-8") as file:
            file.write(result.text_content)
        final_msg = TextMessage(content=f"created:{filename}", source=self.name)
        return Response(chat_message=final_msg, inner_messages=inner_messages)

    async def on_reset(self, cancellation_token: CancellationToken) -> None:
        """No per-conversation state to clear."""
        pass

class DocumentTriageJobAgent(BaseChatAgent):
    """Agent that inspects a PDF and hands it to the appropriate converter.

    A PDF in which no page reports an image is classified ``"TEXT"`` and sent
    to ``MarkItDownAgent``; otherwise it is classified ``"IMAGE"`` and sent to
    ``GeminiAgent``. Both workers write into the module-level ``output_dir``.
    """

    def __init__(self, name: str):
        super().__init__(name, "An agent to decide on how which agent to use for processing document.")

    @property
    def produced_message_types(self) -> Sequence[type[ChatMessage]]:
        """Message classes this agent declares (only ``TextMessage``)."""
        return (TextMessage,)

    async def on_messages(self, messages: Sequence[ChatMessage], cancellation_token: CancellationToken) -> Response:
        """Treat the last message as a file path and triage that file."""
        if not messages:
            return Response(chat_message=TextMessage(content="No File Name received!", source=self.name))
        filePath = messages[-1].content.strip()
        return await self.process(filePath)

    async def process(self, filePath: str) -> Response:
        """Classify ``filePath``, run the matching worker agent, and report completion.

        Returns:
            A ``StopMessage`` response when ``filePath`` is not an existing file
            or the classification is not recognised; otherwise a ``"Done!"``
            ``TextMessage`` once the worker's stream has been consumed. The
            classifier result is attached as an inner message.
        """
        inner_messages: List[ChatMessage] = []
        if not os.path.isfile(filePath):
            return Response(chat_message=StopMessage(content="File path provided isn't a valid file!", source=self.name), inner_messages=inner_messages)
        classifier_result = self.classifier(filePath)
        inner_messages.append(TextMessage(content=f'pdf classifier_result: {classifier_result}', source=self.name))
        print("document type:",classifier_result)
        if classifier_result == "TEXT":
            agent = MarkItDownAgent(name="markdown_agent", output_dir=output_dir)
        elif classifier_result == "IMAGE":
            agent = GeminiAgent(name="llm_agent", output_dir=output_dir, mime_type="application/pdf")
        else:
            # Without this branch an unexpected label (e.g. from an overridden
            # classifier) left `agent` unbound and crashed with UnboundLocalError.
            return Response(
                chat_message=StopMessage(content=f"Unsupported document classification: {classifier_result}", source=self.name),
                inner_messages=inner_messages,
            )
        # Console drains the worker's stream and prints each message as it arrives.
        await Console(
            agent.on_messages_stream(
                [TextMessage(content=filePath, source="user")], CancellationToken()
            )
        )
        final_msg = TextMessage(content="Done!", source=self.name)
        return Response(chat_message=final_msg, inner_messages=inner_messages)

    async def on_reset(self, cancellation_token: CancellationToken) -> None:
        """No per-conversation state to clear."""
        pass

    def classifier(self, pdf_file):
        """Return ``"IMAGE"`` if any page of ``pdf_file`` reports an image, else ``"TEXT"``.

        The document is opened by path inside a ``with`` block so the PyMuPDF
        handle is always closed (it used to be left open), and the scan stops
        at the first page that contains an image.
        """
        with fitz.open(pdf_file) as pdf:
            # Only presence matters, so the slower xref lookup is not requested.
            has_images = any(page.get_image_info() for page in pdf)
        return "IMAGE" if has_images else "TEXT"
    
    # pdf_path = "original/image_and_text_sample.pdf"           
    # pdf_path = "original/text_sample.pdf"
    # file_path = pdf_path
    # classifier_result = classifier(file_path)
    # classifier_result

# debug run
# Main function to run the agent
# async def debug_agent(input_path: str):
#     agent = DocumentTriageJobAgent(name="document_triage_agent")
#     # User's input message
#     user_message = TextMessage(content=input_path, source="user")
#     # Send the message and get the response from Gemini
#     response = await agent.on_messages(messages=[user_message],
#         cancellation_token=CancellationToken(),
#     )
    
#     print(f"response: {response.chat_message}")

# #Run the async function
# await debug_agent("original/image_and_text_sample.pdf")

In [254]:
async def main():
    """Run the interactive triage chat: the user supplies a PDF path, the triage agent converts it.

    The round-robin chat ends when a message mentions "Done!", when a handoff
    targets the user proxy, or when any agent emits a ``StopMessage``.
    """
    # Initialize agents
    user_proxy_agent = UserProxyAgent(name="user_proxy_agent")
    document_triage_agent = DocumentTriageJobAgent(name="document_triage_agent")
    # Define termination conditions.
    # HandoffTermination compares against the target agent's *name* (a string);
    # passing the agent object itself could never match.
    termination = (
        TextMentionTermination("Done!")
        | HandoffTermination(target=user_proxy_agent.name)
        | StopMessageTermination()
    )
    # Create a chat group
    group_chat = RoundRobinGroupChat(
        participants=[user_proxy_agent, document_triage_agent],
        termination_condition=termination
    )
    # No initial task is supplied: the first participant (the user proxy)
    # prompts for the source file itself.
    stream = group_chat.run_stream(cancellation_token=CancellationToken())
    # Await the stream to process messages
    await Console(stream)

# The notebook kernel already runs an event loop; nest_asyncio lets asyncio.run nest inside it.
nest_asyncio.apply()
asyncio.run(main())

input: original/text_sample.pdf
---------- user_proxy_agent ----------
document type: TEXT
content: A Concise

Dictionary
Old Icelandic

of

Fonts by Monokrom
Formatting by Prince

Published by Oxford University Press

1910

Original content by Geir T. Zoëga.
Scanning and proofreading by Sean Crist.
Further proofreading and HTML encoding by Tim Stridmann.
CSS styling by Håkon Wium Lie.

a

af

A

as

góð:

sheet-anchor;

a, a negative suffix to verbs, not; era út-
makligt, at it is not unmeet that.
abbadis (pl. -ar), f. abbess.
abbast (að), v. ref. to be angry, to quar-
rel (a. við e-n, upp á e-n).
abbindi (= afbindi), n. constipation.
aðal, n. nature, disposition.
aðal- in compds., chief, head, principal;
-akkeri, n.
-ból, n.
manor; -borinn, pp. of noble birth, =
óðal-borinn; -festr, f. see alaðsfestr;
-fylking, f. the main body of troops, cen-
tre; -haf, n. the high sea; -henda, f. =
full or perfect
alhenda; -hending, f.
(opposed to
blóð
rhyme,
‘skothending’); -hendr, a. (verse) wi