In [19]:
from loader import load_strips
import requests
import base64
import os
import json
import cv2
import time

In [20]:
# Read the OpenRouter API key from the environment; the value is None when
# the variable is not set.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")

In [21]:
# Instruction prompt for the vision model: asks for a panel-by-panel
# transcription of a comic strip, returned as strict JSON of the form
# {"panels": [{"panel_number": N, "dialogue": [{"speaker", "text"}]}]}.
# It is sent as the text part of the user message in the transcription request.
prompt5 = """
You are a specialized comic transcription engine.

Your task is to transcribe the comic strip image into structured JSON.

---------------------------------------
SCANNING INSTRUCTIONS
---------------------------------------
1. Scan the strip from left to right.
2. Detect vertical borders between panels.
3. Each separated region is a new panel.
4. Preserve reading order strictly.

---------------------------------------
SPEAKER IDENTIFICATION
---------------------------------------
- DILBERT: Man in white shirt with red/black tie.
- DOGBERT: Small white dog.
- BACKGROUND CHARACTER: Any other character.
- Follow the speech bubble pointer line carefully to identify the speaker.

---------------------------------------
OUTPUT FORMAT (STRICT JSON ONLY)
---------------------------------------
Return ONLY valid JSON in this exact structure:

{
  "panels": [
    {
      "panel_number": 1,
      "dialogue": [
        {"speaker": "CharacterName", "text": "Exact dialogue text"}
      ]
    }
  ]
}

---------------------------------------
STRICT RULES
---------------------------------------
- Do NOT include explanations.
- Do NOT include markdown formatting.
- Do NOT wrap output in backticks.
- Do NOT summarize.
- Do NOT paraphrase.
- Preserve exact wording and capitalization.
- If a panel has no dialogue, return:
  {"panel_number": N, "dialogue": []}

Return JSON only.
"""

In [22]:
def extract_balanced_json(raw_string):
    """
    Isolate the final JSON object embedded in a string.

    Fast path: scan backwards from the last closing brace to the opening
    brace that balances it. That scan counts every brace, including braces
    that appear inside JSON string values (e.g. dialogue text containing
    "}"), so its result can be missing or truncated. When the fast-path
    result is absent or does not parse, the string is re-scanned with the
    JSON decoder, which understands quoting and escapes.

    Args:
        raw_string: Text that may contain a JSON object, possibly surrounded
            by other text. Empty or None input yields None.

    Returns:
        The substring holding the last parseable JSON object. If no
        parseable object exists, the brace-balanced substring found by the
        backward scan (which may not be valid JSON), or None when there is
        no such substring either.
    """
    if not raw_string:
        return None

    end_idx = raw_string.rfind('}')
    if end_idx == -1:
        return None  # No closing brace at all, so no JSON object

    # Walk backwards from the last '}' until the braces balance out.
    brace_count = 0
    candidate = None
    for i in range(end_idx, -1, -1):
        ch = raw_string[i]
        if ch == '}':
            brace_count += 1
        elif ch == '{':
            brace_count -= 1
            if brace_count == 0:
                candidate = raw_string[i:end_idx + 1]
                break

    if candidate is not None:
        try:
            json.loads(candidate)
            return candidate
        except ValueError:
            # Braces inside string values misled the scan; fall through.
            pass

    # Quote-aware fallback: try to decode an object at each '{' and keep the
    # last one that parses, skipping past each successfully decoded object.
    decoder = json.JSONDecoder()
    last_valid = None
    pos = raw_string.find('{')
    while pos != -1:
        try:
            _, stop = decoder.raw_decode(raw_string, pos)
        except ValueError:
            pos = raw_string.find('{', pos + 1)
        else:
            last_valid = raw_string[pos:stop]
            pos = raw_string.find('{', stop)

    # Preserve the old best-effort result when nothing parses.
    return last_valid if last_valid is not None else candidate

In [23]:
import requests
import json

# Query the OpenRouter key endpoint and print what it reports for this key
# before starting the transcription run.
response = requests.get(
    url="https://openrouter.ai/api/v1/key",
    headers={"Authorization": f"Bearer {OPENROUTER_API_KEY}"},
    timeout=30,  # requests applies no timeout by default; don't hang the kernel
)

print(json.dumps(response.json(), indent=2))


{
  "data": {
    "label": "sk-or-v1-b0a...37b",
    "is_management_key": false,
    "is_provisioning_key": false,
    "limit": null,
    "limit_reset": null,
    "limit_remaining": null,
    "include_byok_in_limit": false,
    "usage": 0.084242011,
    "usage_daily": 0.00452404,
    "usage_weekly": 0.084242011,
    "usage_monthly": 0.084242011,
    "byok_usage": 0,
    "byok_usage_daily": 0,
    "byok_usage_weekly": 0,
    "byok_usage_monthly": 0,
    "is_free_tier": true,
    "expires_at": null,
    "rate_limit": {
      "requests": -1,
      "interval": "10s",
      "note": "This field is deprecated and safe to ignore."
    }
  }
}


In [24]:
# Transcribe every strip with the vision model and collect the parsed JSON.
MODEL_NAME = "nvidia/nemotron-nano-12b-v2-vl"  # change model here for Model B
MAX_ATTEMPTS = 5            # tries per strip before giving up on it
REQUEST_DELAY_SECONDS = 4   # pause after every attempt, successful or not
OUTPUT_PATH = "nvidia_nemotron_12b_direct.json"

all_predictions = []
strips = load_strips("dilbert_1989_to_2023")
for strip in strips:
    print('\n\n_________________________________________\nDate:', strip['date'])

    # Encode the strip as PNG and embed it in the request as a base64 data URL.
    # NOTE(review): assumes strip["image"] is an array cv2.imencode accepts.
    image = strip["image"]
    encoded_ok, buffer = cv2.imencode(".png", image)
    if not encoded_ok:
        print("PNG encoding failed for:", strip["date"])
        continue
    base64_image = base64.b64encode(buffer.tobytes()).decode("utf-8")

    # Reset per strip: if every attempt fails, this stays None and the strip
    # is skipped instead of silently reusing an earlier strip's response.
    response_json = None
    for _ in range(MAX_ATTEMPTS):
        try:
            response = requests.post(
                "https://openrouter.ai/api/v1/chat/completions",
                headers={
                    "Authorization": f"Bearer {OPENROUTER_API_KEY}",
                    "Content-Type": "application/json"
                },
                json={
                    "model": MODEL_NAME,
                    "temperature": 0,
                    "messages": [
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": prompt5},
                                {
                                    "type": "image_url",
                                    "image_url": {
                                        "url": f"data:image/png;base64,{base64_image}"
                                    }
                                }
                            ]
                        }
                    ]
                },
                timeout=120
            )
        except Exception as e:
            print("Request exception:", e)
            # Pause before retrying so failures do not hammer the API.
            time.sleep(REQUEST_DELAY_SECONDS)
            continue

        time.sleep(REQUEST_DELAY_SECONDS)

        if response.status_code != 200:
            print("Request failed:", response.text)
            continue

        try:
            response_json = response.json()
        except ValueError:
            print("Response body is not JSON:", response.text)
            continue

        break

    if response_json is None:
        print("Giving up after", MAX_ATTEMPTS, "attempts for:", strip["date"])
        continue

    try:
        model_output = response_json["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError):
        print("Malformed response:", response_json)
        continue

    # The content field can be null; there is nothing to parse in that case.
    if not isinstance(model_output, str):
        print("Malformed response:", response_json)
        continue

    model_output = model_output.strip()

    # If the model wrapped its answer in a markdown code fence despite the
    # prompt, keep only the text inside the first fence.
    if model_output.startswith("```"):
        parts = model_output.split("```")
        if len(parts) >= 2:
            model_output = parts[1].strip()

    # Drop any leading "json" label or explanation around the JSON object.
    json_text = extract_balanced_json(model_output)

    if json_text is None:
        print("Could not find valid JSON block")
        continue

    try:
        parsed_output = json.loads(json_text)
    except json.JSONDecodeError:
        print("JSON parsing failed for:", strip["date"])
        print("Raw output:", json_text)
        continue

    print(parsed_output)

    all_predictions.append({
        "date": strip["date"],
        "prediction": parsed_output
    })

with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(all_predictions, f, indent=2)

print(f"\n✅ Saved predictions to {OUTPUT_PATH}")



_________________________________________
Date: 1990-01-01
{'panels': [{'panel_number': 1, 'dialogue': [{'speaker': 'Dilbert', 'text': "I'M GRUMPY TODAY, SO DON'T EVEN TRY TO TALK TO ME."}, {'speaker': 'Dogbert', 'text': ''}]}, {'panel_number': 2, 'dialogue': [{'speaker': 'Dogbert', 'text': "AND DON'T TRY TO FLATTER ME OR GIVE ME CHOCOLATE CAKE TO MAKE ME FEEL BETTER."}, {'speaker': 'Dilbert', 'text': ''}]}, {'panel_number': 3, 'dialogue': [{'speaker': 'Dilbert', 'text': "AND I GUESS I SHOULDN'T SCRATCH YOU BEHIND THE EARS UNTIL YOU HAVE LITTLE LEG SPASMS."}, {'speaker': 'Dogbert', 'text': 'RIGHT. NONE OF THAT.'}]}]}


_________________________________________
Date: 1990-01-02
{'panels': [{'panel_number': 1, 'dialogue': [{'speaker': 'Dogbert', 'text': "I'M STARTING TO WRITE AN UNAUTHORIZED BIOGRAPHY ABOUT YOU."}]}, {'panel_number': 2, 'dialogue': [{'speaker': 'Dogbert', 'text': 'IT\'S KIND OF A "PET AND TELL" EXPOSÉ FULL OF STARTLING REVELATIONS.'}]}, {'panel_number': 3, 'dialogue': 