In [None]:
%pip install google-genai
%pip install matplotlib
%pip install os
%pip install dotenv
%pip install pydantic

In [None]:
from google import genai
from matplotlib import pyplot as plt
from dotenv import load_dotenv
from google.genai import types
from pydantic import BaseModel, Field
from urllib.error import URLError
import requests
from PIL import Image
from io import BytesIO
import os
from typing import Union

In [None]:
class ArtworkDiscovery(BaseModel):
    # Structured result for a free-form question about an artwork image: the
    # answer itself plus the reasoning behind it. Both fields are required strings.
    # The field descriptions below are part of this model's JSON schema, which
    # curate_artwork_postcard() passes to the API as the expected response format.
    answer: str = Field(..., description="The answer to the prompt")
    reasoning: str = Field(..., description="The reasoning behind the answer")

class PostcardLabel(BaseModel):
    # Structured result for the printed label on the back of a postcard.
    # Every field is required (`...` default); all are strings except `price`,
    # which is a float. make_postcard_label_testcase() uses this class as the
    # response model for its label-reading prompt.
    title: str = Field(..., description="The title of the artwork on the postcard")
    dimensions: str = Field(..., description="The dimensions of the postcard")
    manufacturing_date: str = Field(..., description="The manufacturing date of the postcard")
    manufacturing_location: str = Field(..., description="The manufacturing location of the postcard")
    brand: str = Field(..., description="The brand of the postcard")
    manufacturer: str = Field(..., description="The manufacturer of the postcard")
    phone_number: str = Field(..., description="The phone number of the postcard manufacturer")
    address: str = Field(..., description="The address of the postcard manufacturer")
    serial_number: str = Field(..., description="The serial number of the postcard")
    # NOTE(review): no currency is captured alongside the amount — confirm that is intended.
    price: float = Field(..., description="The price of the postcard")
    reasoning: str = Field(..., description="The reasoning behind the answer")

class VisionTestCase(BaseModel):
    """One vision request: an image URL, a prompt about that image, and the
    pydantic model class that describes the expected structured response."""
    image_url: str = Field(..., description="The URL of the image to be analyzed")
    prompt: str = Field(..., description="The prompt to be answered based on the image")
    # Holds the model *class* itself (ArtworkDiscovery or PostcardLabel), not an instance.
    response_model: Union[type[ArtworkDiscovery], type[PostcardLabel]]  = Field(..., description="The Pydantic model that defines the expected response format")

In [None]:
def create_vertexai_client():
    """Build a google-genai client in Vertex AI mode from GOOGLE_CLOUD_API_KEY.

    Returns:
        The ``genai.Client`` created with ``vertexai=True`` and the API key.

    Raises:
        ValueError: If the GOOGLE_CLOUD_API_KEY environment variable is unset or empty.
    """
    api_key = os.getenv("GOOGLE_CLOUD_API_KEY")
    if api_key:
        # The key is read from the environment only; it is never hardcoded here.
        return genai.Client(vertexai=True, api_key=api_key)
    raise ValueError("GOOGLE_CLOUD_API_KEY not found in .env file")

In [None]:
def load_image_from_url(url: str, timeout: float = 30.0):
    """Download an image over HTTP(S) and render it inline with matplotlib.

    Failures are reported with ``print`` instead of being raised, so a broken
    link does not stop the rest of the notebook.

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up.
    """
    try:
        response = requests.get(url=url, timeout=timeout)
        # requests does not raise on 4xx/5xx by itself; without this call an
        # error page would be handed to PIL and surface as a decode error.
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
        plt.imshow(img)
        plt.axis('off')
        plt.show()
    except requests.HTTPError as e:
        # The status code lives on the attached response; requests' HTTPError
        # has no `.code` attribute (that belongs to urllib's HTTPError).
        status_code = e.response.status_code if e.response is not None else None
        if status_code == 404:
            print("Error: URL not found (404).")
        else:
            print(f"HTTP Error: {status_code}")
    except requests.RequestException as e:
        # Connection failures, timeouts, malformed URLs and similar transport errors.
        print(f"Error: Could not retrieve '{url}': {e}")
    except Exception as e:
        # Anything else, e.g. PIL being unable to decode the downloaded bytes.
        print(f"An error occurred: {e}")

In [None]:
# Load variables from a local .env file into the process environment so that
# create_vertexai_client() can read GOOGLE_CLOUD_API_KEY via os.getenv.
load_dotenv()

# Module-level client; curate_artwork_postcard() sends its requests through it.
client = create_vertexai_client()

In [None]:
# Resources
# Blog: https://blog.google/innovation-and-ai/technology/developers-tools/agentic-vision-gemini-3-flash/
# Code Sample: https://ai.google.dev/gemini-api/docs/code-execution#images

def clean_json_string(raw_string):
    """Strip a surrounding Markdown code fence from a JSON payload.

    Handles an opening fence with a ``json`` language tag in any letter case
    as well as a bare opening fence, plus the closing fence. Text without
    fences is only trimmed of surrounding whitespace.

    Args:
        raw_string: Raw text returned by the model; may be ``None`` or empty.

    Returns:
        The unfenced, whitespace-trimmed text, or ``""`` when there is no text.
    """
    if not raw_string:
        return ""

    clean_str = raw_string.strip()
    if clean_str.lower().startswith("```json"):
        # Opening fence with the language tag: drop all 7 characters.
        clean_str = clean_str[7:]
    elif clean_str.startswith("```"):
        # Opening fence without a language tag.
        clean_str = clean_str[3:]
    if clean_str.endswith("```"):
        clean_str = clean_str[:-3]
    return clean_str.strip()

def curate_artwork_postcard(test_case: VisionTestCase) -> types.GenerateContentResponse:
    """Send one test case to Gemini and return the raw response.

    The request is made through the module-level ``client`` and contains a
    single user turn holding the image (by URI) followed by the prompt text.
    JSON output is requested using the JSON schema of
    ``test_case.response_model``, with high thinking level, high media
    resolution and the code-execution tool enabled.

    Args:
        test_case: Image URL, prompt and expected response model.

    Returns:
        The response object returned by ``client.models.generate_content``.
    """
    image_part = types.Part.from_uri(file_uri=test_case.image_url)
    prompt_part = types.Part(text=test_case.prompt)
    user_turn = types.Content(role="user", parts=[image_part, prompt_part])

    generation_config = types.GenerateContentConfig(
        response_mime_type="application/json",
        response_json_schema=test_case.response_model.model_json_schema(),
        thinking_config=types.ThinkingConfig(thinking_level=types.ThinkingLevel.HIGH),
        media_resolution=types.MediaResolution.MEDIA_RESOLUTION_HIGH,
        tools=[types.Tool(code_execution=types.ToolCodeExecution)],
    )

    return client.models.generate_content(
        model="gemini-3-flash-preview",
        contents=[user_turn],
        config=generation_config,
    )

def print_parts(response: types.GenerateContentResponse):
    """Print or display every part of the first candidate in a response.

    Non-blank text, executable code and code-execution results are printed;
    parts that carry an image are rendered inline. Nothing is printed when the
    response has no candidates, no content or no parts.

    Args:
        response: The response returned by the generate-content call.
    """
    # Any of candidates / content / parts may be missing, so fall back to an
    # empty list rather than failing on indexing or iteration.
    candidates = response.candidates or []
    content = candidates[0].content if candidates else None
    parts = (content.parts if content is not None else None) or []

    for part in parts:
        if part.text is not None and part.text.strip():
            print("part.text -> ", part.text.strip())
        if part.executable_code is not None:
            print("part.executable_code -> ", part.executable_code)
        if part.code_execution_result is not None:
            print("part.code_execution_result -> ", part.code_execution_result)
        # Convert once and reuse the result instead of calling as_image() twice.
        image = part.as_image()
        if image is not None:
            # display() is a standard function in Jupyter/Colab notebooks
            display(Image.open(BytesIO(image.image_bytes)))

def print_artwork_result(test_case: VisionTestCase):
    """Run one test case and print the response parts and the validated result.

    The response is validated against ``test_case.response_model``: from
    ``response.parsed`` when it is truthy, otherwise from ``response.text``
    after stripping Markdown fences. The raw parts are printed first, then the
    labelled fields of the validated model.

    Args:
        test_case: Image URL, prompt and expected response model.
    """
    response = curate_artwork_postcard(test_case=test_case)
    model_cls = test_case.response_model

    result = (
        model_cls.model_validate(response.parsed)
        if response.parsed
        else model_cls.model_validate_json(clean_json_string(response.text))
    )

    print_parts(response=response)

    # (label, value) pairs; print() joins all tokens with single spaces.
    if isinstance(result, ArtworkDiscovery):
        labelled = [
            ("Final Answer: ", result.answer),
            ("\nReasoning: ", result.reasoning),
        ]
    elif isinstance(result, PostcardLabel):
        labelled = [
            ("Title: ", result.title),
            ("\nDimensions: ", result.dimensions),
            ("\nManufacturing Date: ", result.manufacturing_date),
            ("\nManufacturing Location: ", result.manufacturing_location),
            ("\nBrand: ", result.brand),
            ("\nManufacturer: ", result.manufacturer),
            ("\nPhone Number: ", result.phone_number),
            ("\nAddress: ", result.address),
            ("\nSerial Number: ", result.serial_number),
            ("\nPrice: ", result.price),
            ("\nReasoning: ", result.reasoning),
        ]
    else:
        return

    print(*(token for pair in labelled for token in pair))

def print_test_cases(heading: str, cases: list[VisionTestCase]):
    """Print a heading, then run and print each test case in list order.

    Args:
        heading: Line printed once before any case is run.
        cases: Test cases handed one by one to ``print_artwork_result``.
    """
    print(heading)
    for position in range(len(cases)):
        print_artwork_result(test_case=cases[position])

def show_the_postcard_front_and_back(front: str, back: str):
    """Display both sides of a postcard and return their URLs.

    Args:
        front: File name of the front image under the base URL.
        back: File name of the back image under the base URL.

    Returns:
        A ``(front_url, back_url)`` tuple of the full image URLs.
    """
    base_url = "https://raw.githubusercontent.com/railsstudent/colab_images/refs/heads/main/agentic_visions"
    urls = tuple(f"{base_url}/{file_name}" for file_name in (front, back))

    # Front first, then back.
    for image_url in urls:
        load_image_from_url(url=image_url)

    return urls

def make_postcard_label_testcase(back_url: str) -> VisionTestCase:
    """Build the label-reading test case for the back of a postcard.

    Args:
        back_url: URL of the image showing the back of the postcard.

    Returns:
        A ``VisionTestCase`` whose prompt lists the ten label fields to extract
        and whose response model is ``PostcardLabel``.
    """
    requested_fields = (
        "the title of the artwork",
        "the dimensions",
        "the manufacturing date of the postcard",
        "the manufacturing location of the postcard",
        "the brand of the postcard",
        "the manufacturer of the postcard",
        "the phone number of the postcard manufacturer",
        "the address of the postcard manufacturer",
        "the serial number of the postcard",
        "the price of the postcard",
    )
    # One numbered item per line. Adjacent string literals concatenate with no
    # separator, which previously ran the items together ("...artwork2) the ...").
    numbered_items = "\n".join(
        f"{number}) {field}" for number, field in enumerate(requested_fields, start=1)
    )

    return VisionTestCase(
        image_url=back_url,
        prompt=f"Zoom to the label and find:\n{numbered_items}",
        response_model=PostcardLabel,
    )

In [None]:
# (front file, back file) for each postcard, in display order.
postcard_files = [
    ("up-the-river-front.jpg", "up-the-river-back.jpg"),
    ("bridge_front.jpg", "bridge_back.jpg"),
    ("fu-chun-mountain-front.jpg", "fu-chun-mountain-back.jpg"),
]

# Show both sides of every postcard and keep the URLs for the test-case cells.
postcard_urls = [
    show_the_postcard_front_and_back(front=front_file, back=back_file)
    for front_file, back_file in postcard_files
]

(front_url, back_url), (front_url2, back_url2), (front_url3, back_url3) = postcard_urls

In [None]:
# Front image: count the animals near the wagons.
wagon_animals_case = VisionTestCase(
    response_model=ArtworkDiscovery,
    image_url=front_url,
    prompt="Zoom to bottom of the postcard to find the number of horses/donkeys/mules near the wagons",
)
up_the_river_test_cases = [wagon_animals_case]

print_test_cases(cases=up_the_river_test_cases, heading="Up the River test cases")

In [None]:
# Front image: count the people in the courtyard with the swing.
courtyard_case = VisionTestCase(
    response_model=ArtworkDiscovery,
    image_url=front_url,
    prompt="Zoom to the courtyard where the swing is. Please tell me the number of people in there.",
)
# Back image: read the printed label.
river_label_case = make_postcard_label_testcase(back_url=back_url)
up_the_river_test_cases = [courtyard_case, river_label_case]

print_test_cases(cases=up_the_river_test_cases, heading="Up the River test cases")

In [None]:
# Two questions about the front of the bridge postcard.
bridge_test_cases = [
    VisionTestCase(
        image_url=front_url2, 
        prompt="Zoom to the bridge and describe what the monks are doing.",
        response_model=ArtworkDiscovery
    ),
    VisionTestCase(
        image_url=front_url2, 
        prompt="Zoom to the top of the bridge and describe what is there.",
        response_model=ArtworkDiscovery
    ),
]

# The heading names the bridge postcard; it previously repeated the
# "Up the River" heading from the earlier cells.
print_test_cases(heading="Bridge test cases", cases=bridge_test_cases)

In [None]:
# One more question about the front of the bridge postcard, plus the label on its back.
bridge_test_cases = [
    VisionTestCase(
        image_url=front_url2, 
        prompt="Zoom to the entrance of the bridge and describe what the children are doing with a hawker.",
        response_model=ArtworkDiscovery
    ),
    make_postcard_label_testcase(back_url=back_url2),
]

# The heading names the bridge postcard; it previously repeated the
# "Up the River" heading from the earlier cells.
print_test_cases(heading="Bridge test cases", cases=bridge_test_cases)

In [None]:
# Front image: find the boat at the bottom and describe the people on it.
boat_case = VisionTestCase(
    response_model=ArtworkDiscovery,
    image_url=front_url3,
    prompt="Zoom to the bottom of the postcard to find the fish boat. Then, describe how many people found and what they are doing.",
)
# Front image: count and describe the houses on the right.
houses_case = VisionTestCase(
    response_model=ArtworkDiscovery,
    image_url=front_url3,
    prompt="Zoom to the right of the postcard to find the number of houses and describe their external appearances in 300 words maximum.",
)
# Back image: read the printed label.
mountain_label_case = make_postcard_label_testcase(back_url=back_url3)
mountain_test_cases = [boat_case, houses_case, mountain_label_case]

print_test_cases(cases=mountain_test_cases, heading="Fu Chun Mountain test cases")