Activity 1

In [1]:
import time
import fitz  # PyMuPDF
from PIL import Image
import io

from vertexai import init
from vertexai.generative_models import GenerativeModel, Part, GenerationConfig

def extract(pdf_path: str, page_number: int = 0):
    """Extract the prompt text and the first embedded image from a PDF page.

    Args:
        pdf_path: Path of the PDF file to open.
        page_number: Index of the page to read (defaults to the first page).

    Returns:
        A ``(prompt_text, image)`` tuple: the page's text with surrounding
        whitespace stripped, and the first image listed for the page,
        converted to an RGB ``PIL.Image.Image``.

    Raises:
        ValueError: If the page lists no images.

    Any error raised by PyMuPDF while opening the file or indexing the page
    (e.g. an out-of-range ``page_number``) propagates unchanged.
    """
    # Use the document as a context manager so the underlying file handle is
    # released even when we bail out with an exception below.
    with fitz.open(pdf_path) as doc:
        page = doc[page_number]
        prompt_text = page.get_text().strip()

        images = page.get_images(full=True)
        if not images:
            raise ValueError("No images found on the specified page.")

        # First entry of each image record is its xref; take the first image.
        xref = images[0][0]
        # Copy the raw bytes out while the document is still open.
        image_bytes = doc.extract_image(xref)["image"]

    # Decoding works on an in-memory copy, so the document may already be closed.
    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

    return prompt_text, image

def describe_image(
    pdf_path: str,
    page_number: int = 0,
    *,
    project: str = "cjnyasg-customer360",
    location: str = "us-central1",
    model_name: str = "gemini-2.0-flash-001",
):
    """Ask a Gemini model to respond to a PDF page's text prompt and image.

    The page's text is used as the prompt and the first image on the page is
    sent alongside it as PNG data. The elapsed time, the result dict and its
    type are printed before the result is returned.

    Args:
        pdf_path: Path of the PDF file containing the prompt and image.
        page_number: Index of the page to read (defaults to the first page).
        project: Google Cloud project passed to ``vertexai.init``.
        location: Google Cloud region passed to ``vertexai.init``.
        model_name: Identifier of the generative model to use.

    Returns:
        A dict with a single key ``"Description"`` holding the model's
        response text with surrounding whitespace stripped.

    Raises:
        ValueError: If the page has no images (raised by ``extract``).

    Errors from the Vertex AI SDK (authentication, quota, blocked or empty
    responses when reading ``response.text``) propagate unchanged.
    """
    # perf_counter is monotonic, so the reported duration cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    started = time.perf_counter()
    init(project=project, location=location)

    model = GenerativeModel(model_name)

    prompt_text, image = extract(pdf_path, page_number)

    # Re-encode the PIL image as PNG bytes so the MIME type below is accurate
    # regardless of the format the image had inside the PDF.
    png_buffer = io.BytesIO()
    image.save(png_buffer, format='PNG')
    img_bytes = png_buffer.getvalue()

    # Create Gemini-compatible Parts: prompt text first, then the image.
    input_parts = [
        Part.from_text(prompt_text),
        Part.from_data(data=img_bytes, mime_type="image/png"),
    ]

    generation_config = GenerationConfig(temperature=0.2, max_output_tokens=1024)

    response = model.generate_content(
        contents=input_parts,
        generation_config=generation_config,
    )
    description = response.text.strip()

    result = {"Description": description}
    print("Time elapsed:", time.perf_counter() - started)
    print(result)
    print(type(result))

    return result

# === Usage ===
# Script entry point: describe the first page of the sample PDF.
if __name__ == "__main__":
    sample_pdf = "C:/Users/richard.c.ferrer/Agents/image_ai.pdf"
    describe_image(sample_pdf)


Time elapsed: 8.20395541191101
{'Description': "The image shows a brown dachshund dog standing on its hind legs in front of a black electric piano. The dog is wearing a black collar with silver accents. Its front paws are resting on the piano keys, as if it is playing the instrument. Sheet music is propped up on the piano's music stand. The background is blurred, but it appears to be a room with a plant and curtains. The overall impression is humorous and whimsical, suggesting the dog is a talented musician."}
<class 'dict'>
