# Phase 0 — Bedrock Claude 3.5 Sonnet Caption Generator

This notebook generates concise, catchy captions (video titles) for a video from its cover image and a short text description, using Anthropic Claude 3.5 Sonnet on AWS Bedrock.
- Prerequisite: configure your AWS credentials by running `aws configure` in a shell before running this notebook.



In [1]:
# %pip install -q boto3 botocore

import os
import json
import base64
import mimetypes

import boto3



In [31]:
# --- Model -----------------------------------------------------------------
# Identifier handed to Bedrock's invoke_model as `modelId`.
MODEL_ID = "us.anthropic.claude-3-5-sonnet-20241022-v2:0"

# --- Inputs ----------------------------------------------------------------
# Prompt template; the "{video_description}" placeholder in it is replaced
# with VIDEO_DESCRIPTION below.
PROMPT_FILE = "phase0/prompts/caption_generation_prompt.txt"
# Cover image that is base64-encoded and sent alongside the prompt.
IMAGE_PATH = "video_cover1.jpg"
# Free-text description of the video, injected into the prompt template.
VIDEO_DESCRIPTION = "我和我的柴犬宝宝训练的搞笑视频，我先演示一遍动作，柴犬就跟着做一遍动作，比如坐，趴下，打滚。"

# --- Generation settings (forwarded in the request payload) ----------------
MAX_TOKENS = 1024   # sent as "max_tokens"
TEMPERATURE = 1     # sent as "temperature"

# --- Run switches ----------------------------------------------------------
PRINT_PAYLOAD = True   # print the resolved image path and a payload preview
DRY_RUN = False        # when True, the Bedrock call is skipped entirely


In [27]:
# Read prompt and substitute placeholder (robust path resolution)
#
# Every fallback location is derived from PROMPT_FILE itself. Hard-coding the
# default file name in the fallbacks would mean that pointing PROMPT_FILE at a
# different (missing) template silently loads the default caption prompt.
candidates = [PROMPT_FILE]  # as configured, e.g. when running from repo root
if not os.path.isabs(PROMPT_FILE):
    candidates += [
        os.path.relpath(PROMPT_FILE, "phase0"),  # same file as seen from inside phase0/
        os.path.join("..", PROMPT_FILE),         # when running one level below the repo root
    ]

# First existing candidate wins; fail loudly and list what was tried otherwise.
resolved_prompt = next((c for c in candidates if os.path.isfile(c)), None)
if resolved_prompt is None:
    raise FileNotFoundError(
        f"Prompt file not found: {PROMPT_FILE} (checked: {candidates})"
    )

with open(resolved_prompt, "r", encoding="utf-8") as f:
    prompt_template = f.read()

# Inject the video description into the template's placeholder.
filled_prompt = prompt_template.replace("{video_description}", VIDEO_DESCRIPTION)
print("Text Messgae in Prompt Loaded:\n")
print(filled_prompt)


Text Messgae in Prompt Loaded:

<Role>
You are an expert social media manager designing creative captions.
</Role>

<Input>
Video Cover: An image for the video cover. This is what the audience will see in their home page. 
Video Description: Below is a description of what the video is about:
我和我的柴犬宝宝训练的搞笑视频，我先演示一遍动作，柴犬就跟着做一遍动作，比如坐，趴下，打滚。
</Input>

<Task>
- Respond in the same language as main language used in Video Description, if not provided, use English. 
- Step 1: Carefully analyze the input Video Cover and Video Description to understand the key message or highlight of the video, then summarize the selling point we should leverage to attract audience.
- Step 2: Identify 3 different video title stype for the video, ex: humor, emotional, cute.
- Step 3: For each style from Step 2, generate ONE concise, catchy video title under a soft limit of 20 characters. Make engaging, natural, and suitable for social media, along with the Video Cover it should attract audience to click and engag

In [33]:
# Prepare image and build payload (robust path resolution)
#
# Fallback locations are derived from IMAGE_PATH itself, so changing
# IMAGE_PATH can never silently pick up a different image file.
candidates = [
    IMAGE_PATH,
    os.path.join("..", IMAGE_PATH),        # if running from phase0/
    os.path.join("phase0", IMAGE_PATH),    # if running from repo root
]

# First existing candidate wins; fail loudly and list what was tried otherwise.
resolved_image = next((c for c in candidates if os.path.isfile(c)), None)
if resolved_image is None:
    raise FileNotFoundError(
        f"Image not found: {IMAGE_PATH} (checked also: {candidates})"
    )

with open(resolved_image, "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

# Image media types accepted by the Claude Messages API. A generic fallback
# such as "application/octet-stream" would only be rejected by the service,
# so an unrecognised type is reported here instead, before any request is made.
SUPPORTED_IMAGE_MIME_TYPES = ("image/jpeg", "image/png", "image/gif", "image/webp")

mime, _ = mimetypes.guess_type(resolved_image)
if mime not in SUPPORTED_IMAGE_MIME_TYPES:
    raise ValueError(
        f"Unsupported image type {mime!r} for {resolved_image}; "
        f"expected one of {SUPPORTED_IMAGE_MIME_TYPES}"
    )
image_mime = mime

# Request body: a single user turn containing the image followed by the
# filled-in text prompt.
payload = {
    "anthropic_version": "bedrock-2023-05-31",
    "max_tokens": MAX_TOKENS,
    "temperature": TEMPERATURE,
    "messages": [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": image_mime,
                        "data": image_b64,
                    },
                },
                {"type": "text", "text": filled_prompt},
            ],
        }
    ],
}

if PRINT_PAYLOAD:
    print("Using image:", resolved_image)
    # Only the first 800 characters are shown; the base64 image data makes the
    # full payload far too large to print.
    print(json.dumps(payload, indent=2)[:800])


Using image: ../video_cover1.jpg
{
  "anthropic_version": "bedrock-2023-05-31",
  "max_tokens": 1024,
  "temperature": 1,
  "messages": [
    {
      "role": "user",
      "content": [
        {
          "type": "image",
          "source": {
            "type": "base64",
            "media_type": "image/jpeg",
            "data": "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgFBgcGBQgHBgcJCAgJDBMMDAsLDBgREg4THBgdHRsYGxofIywlHyEqIRobJjQnKi4vMTIxHiU2OjYwOiwwMTD/2wBDAQgJCQwKDBcMDBcwIBsgMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDD/wAARCASQBJIDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2u


In [34]:
# Create Bedrock client and invoke
if DRY_RUN:
    # Dry run: no client is created and no request is sent.
    print("[dry-run] Skipping Bedrock invocation.")
else:
    bedrock = boto3.client("bedrock-runtime", region_name="us-east-2")
    resp = bedrock.invoke_model(modelId=MODEL_ID, body=json.dumps(payload))

    # Read the response body if it exposes .read(); otherwise use its string form.
    raw = resp.get("body")
    if hasattr(raw, "read"):
        text = raw.read().decode("utf-8")
    else:
        text = str(raw)

    # Print the text of the first content block in the model's reply.
    response_json = json.loads(text)
    first_block = response_json["content"][0]
    print(first_block["text"])


{
  "selling_point": "聪明可爱的柴犬模仿主人做各种动作训练",
  "captions": [
    {
      "style": "搞笑",
      "caption": "柴犬：主人我学你的"
    },
    {
      "style": "萌宠",
      "caption": "柴犬小课堂：模仿篇"
    },
    {
      "style": "温馨",
      "caption": "和柴犬的快乐训练时光"
    }
  ]
}
