# Grok Use Case: Image Inputs
### This notebook demonstrates how to use Grok for analyzing and reasoning over image inputs, specifically focusing on software architecture diagrams.

In [None]:
import base64
import os
import textwrap

from dotenv import load_dotenv

from autogen import LLMConfig, UserProxyAgent
from autogen.agentchat import initiate_group_chat
from autogen.agentchat.assistant_agent import AssistantAgent
from autogen.agentchat.conversable_agent import ConversableAgent
from autogen.agentchat.group import AgentNameTarget
from autogen.agentchat.group.llm_condition import StringLLMCondition
from autogen.agentchat.group.on_condition import OnCondition
from autogen.agentchat.group.patterns.pattern import DefaultPattern

load_dotenv()

In [None]:
# Initialize LLMConfig for Grok
# The API key is read once from the environment (load_dotenv() is called in the
# imports cell) so that it is never hard-coded in the notebook.
xai_api_key = os.getenv("XAI_API_KEY")

# Chat configuration: Grok is addressed through the "openai" client type with
# the base URL pointed at the xAI endpoint.
grok_chat_entry = {
    "model": "grok-4",
    "api_type": "openai",  # Use existing openai type only
    "base_url": "https://api.x.ai/v1",
    "api_key": xai_api_key,
    "max_tokens": 1000,
}
llm_config = LLMConfig(config_list=[grok_chat_entry], temperature=0.5)

# Second configuration that requests the built-in "image_generation" tool.
# NOTE(review): no cell visible in this notebook passes image_config to an agent
# (design_agent is built with llm_config), and unlike llm_config it sets no
# base_url - confirm whether both are intended.
image_config = LLMConfig(
    api_type="responses",
    model="grok-4",
    api_key=xai_api_key,
    built_in_tools=["image_generation"],
)

## This example demonstrates the image generation and captioning capabilities of Grok 4 with the following architecture.

1. **Image Generation:** Highly detailed Image Generation.
2. **Image Captioning:** Precise Image OCR capabilities.

### Solution Architect Agent architecture

1. Analyst agent (OCR on Image)
2. Solution Architect (Enhance existing architecture)
3. User Agent 
4. Design Agent (generates images and performs analysis on them)

In [None]:
# System prompts are prepared first so the agent constructors below stay short.
# Each literal is dedented and stripped before it is handed to the agent.
analyst_prompt = textwrap.dedent("""
        You are an Analyst agent that can reason over images.
        You will be provided with an image and you will need to analyze it.
        the image will most probably an image of a software architecture.
        You will need to analyze the image and provide a detailed analysis of the software architecture.
    """).strip()

architect_prompt = textwrap.dedent("""
        You are a solution architect that can reason over descriptions of an software architecture.
        You will be provided with a description of a software architecture and you will need to analyze it.
        You will need to analyze the description and provide and propose a new software architecture with enhancements.
        the new architecture should be more efficient, secure, and scalable.
        the new architecture should include the following components:
        1) IMPORTANT: only provide the FLOW of new Architecture components from start to end.
        2) IMPORTANT: flow should be concise and to the point. as a graph with description of each node and connection.
        3) exit once image is generated.
    """).strip()

designer_prompt = textwrap.dedent("""
    generate images for software architecture.
    the image should be a flow of the software architecture.
    the image should be in a format that can be used to generate a software architecture.
    # if solution architect returns a new software architecture flow, you should generate an image for the new software architecture flow.
    """).strip()

# These three agents are created inside the llm_config context and receive no
# explicit llm_config argument; presumably the context supplies it as their
# default - confirm against the LLMConfig documentation.
with llm_config:
    # Reads the architecture image and describes it.
    analyst = AssistantAgent(name="analyst", system_message=analyst_prompt)

    # Proposes an improved architecture flow; limited to one automatic reply.
    solution_architect = ConversableAgent(
        name="solution_architect",
        system_message=architect_prompt,
        max_consecutive_auto_reply=1,
    )

    # Human-in-the-loop proxy: always asks the user for input.
    user_agent = UserProxyAgent(name="user", human_input_mode="ALWAYS")

# The design agent is created outside the context and is given llm_config
# explicitly; it is the agent the tool functions below call via run().
design_agent = AssistantAgent(
    name="design_agent",
    system_message=designer_prompt,
    max_consecutive_auto_reply=1,
    llm_config=llm_config,
)

In [None]:
# ----helper function to save image from base64 string----
def save_b64_png(b64_str, fname="generated.png"):
    """Decode a base64 payload and write the resulting bytes to ``fname``.

    Args:
        b64_str: Base64-encoded data, in any form ``base64.b64decode`` accepts.
        fname: Destination path. It is opened in binary write mode, so an
            existing file at that path is replaced.

    Raises:
        binascii.Error: If ``b64_str`` is not valid base64. The destination
            file is left untouched in that case.
    """
    # Decode before opening: "wb" truncates the target as soon as it is opened,
    # so decoding inside the ``with`` block would wipe an existing image (or
    # leave an empty file behind) whenever the payload turns out to be invalid.
    image_bytes = base64.b64decode(b64_str)
    with open(fname, "wb") as f:
        f.write(image_bytes)
    print(f"image saved → {fname}")


def save_artbot_images_from_response(response):
    """Save every generated image found in ``design_agent`` messages.

    Walks ``response.messages`` and, for each message whose ``name`` is
    ``"design_agent"``, looks through its content parts for entries with
    ``type == "tool_call"``, ``name == "image_generation"`` and a non-empty
    ``content`` value. Each such value is passed to ``save_b64_png`` and written
    to ``image<i>.png``, where ``<i>`` is the position of the message.

    Args:
        response: Object exposing a ``messages`` iterable of message dicts.

    Returns:
        None. Images are written to the current working directory.
    """
    for index, message in enumerate(response.messages or []):
        if not isinstance(message, dict) or message.get("name") != "design_agent":
            continue

        parts = message.get("content")
        # A plain-text reply stores its content as a str (or None). Iterating
        # that would yield characters and fail on ``.get``, so only structured
        # content lists are inspected.
        if not isinstance(parts, (list, tuple)):
            continue

        for part in parts:
            if not isinstance(part, dict):
                continue
            is_image_call = part.get("type") == "tool_call" and part.get("name") == "image_generation"
            if is_image_call and part.get("content"):
                print("Saving image!")
                # NOTE(review): the file name depends only on the message index,
                # so several images in one message overwrite each other.
                save_b64_png(part["content"], f"image{index}.png")

### Define the agent tools and their descriptions
1. To Get Image Descriptions
2. To Generate Image

In [None]:
# Description handed to the LLM for the image-description tool. The text starts
# and ends with a newline, exactly as a triple-quoted block would produce.
decription_tool_prompt = (
    "\n"
    "This tool is used to get the description of the architecture image.\n"
    "Input Args:\n"
    "- image_url: str (url of the architecture image)\n"
)


@analyst.register_for_llm(description=decription_tool_prompt)
@user_agent.register_for_execution(description=decription_tool_prompt)
async def get_image_description(image_url: str):
    """Ask ``design_agent`` for a description and component flow of an image.

    Registered as a tool that ``analyst`` can call and ``user_agent`` executes.

    Args:
        image_url: URL of the architecture image to describe.

    Returns:
        The ``content`` of the last message recorded by ``design_agent`` after
        the run completes.
    """
    prompt = f"""
    Given the following architecture image: {image_url}
    Return a short and concise description of the image.
    Then, provide the flow of the architecture in clear, numbered or bulleted points.
    Format:
    Description: <one paragraph understanding the architecture>
    Flow:
    1. <first step/component>(description)
    2. <second step/component>(description)
    ...
    Only include the essential components and their order in the flow.
    """
    # One user message with two parts: the instructions and the image itself.
    # NOTE(review): the text part uses type "input_text" while the image part
    # uses the "image_url" form - confirm both are accepted by the client that
    # design_agent is configured with.
    chat = {
        "role": "user",
        "content": [
            {
                "type": "input_text",
                # dedent ignores whitespace-only lines, so dedenting the prompt
                # directly gives the same text as re-wrapping it first.
                "text": textwrap.dedent(prompt).strip(),
            },
            {"type": "image_url", "image_url": {"url": image_url, "detail": "high"}},
        ],
    }
    # Use max_turns, the same keyword design_architecture passes to run(), so
    # this call is also limited to a single turn.
    design_agent.run(message=chat, user_input=False, max_turns=1).process()
    last_message = design_agent.last_message()
    return last_message["content"]


# Description handed to the LLM for the diagram-generation tool. The text
# starts and ends with a newline, exactly as a triple-quoted block would produce.
tool_prompt = (
    "\n"
    "This tool is used to generate an architecture flowchart image for the provided software architecture flow.\n"
    "Input Args:\n"
    "- architecture_flow: str (detail flow of the software architecture in numbered or bulleted points)\n"
)


@solution_architect.register_for_llm(description=tool_prompt)
@user_agent.register_for_execution(description=tool_prompt)
async def design_architecture(architecture_flow: str):
    """Ask ``design_agent`` to draw a flowchart for an architecture flow.

    Registered as a tool that ``solution_architect`` can call and ``user_agent``
    executes. Any generated images found in the run's messages are written to
    disk by ``save_artbot_images_from_response``.

    Args:
        architecture_flow: The architecture flow, as numbered or bulleted text.

    Returns:
        The last content part of ``design_agent``'s final message when that
        content is a non-empty list, otherwise the content itself.
    """
    response = design_agent.run(
        message=f"generate an architecture flowchart image for the following software architecture flow: {architecture_flow}",
        chat_history=True,
        user_input=False,
        max_turns=1,
    )
    # Keep the run response itself: chaining .process() onto run() would bind
    # process()'s return value instead, and response.messages is read below.
    response.process()

    last_message = design_agent.last_message()
    save_artbot_images_from_response(response)

    content = last_message["content"]
    # Indexing a plain string with [-1] would return only its final character,
    # so only structured (list) content is reduced to its last part.
    if isinstance(content, list) and content:
        return content[-1]
    return content

### DefaultPattern utilizing an LLM-based handoff condition

In [None]:
# Group chat layout: the analyst speaks first, the solution architect is the
# other participant, and the group manager is given the Grok chat configuration.
default_pattern = DefaultPattern(
    agents=[analyst, solution_architect],
    initial_agent=analyst,
    user_agent=user_agent,
    group_manager_args={"llm_config": llm_config},
)

# LLM-evaluated handoff: route from the analyst to the solution architect when
# the prompt below is judged to hold.
architect_target = AgentNameTarget("solution_architect")
description_ready = StringLLMCondition(
    prompt="When Analyst agent returns Description/Analysis of an Architecture Image"
)
handoff_to_architect = OnCondition(target=architect_target, condition=description_ready)
analyst.handoffs.add_llm_conditions([handoff_to_architect])

In [None]:
# Architecture diagram analysed in this run.
IMAGE_URL = "https://user-images.githubusercontent.com/65826354/179526761-7f473e3d-f71c-429d-bf49-16958c5cb7a6.png"

# Opening request sent to the group chat; the image URL is embedded in the text.
opening_request = f"Describe this image {IMAGE_URL} provide a detailed analysis of the software architecture."

# Run the group chat for at most 20 rounds and unpack the three returned values.
chat_outcome = initiate_group_chat(
    pattern=default_pattern,
    messages=opening_request,
    max_rounds=20,
)
default_paresult, context, last_agent = chat_outcome