In [None]:
from transformers import AutoProcessor, AutoModelForVision2Seq
import torch

# Checkpoint identifier shared by the model and its processor.
MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"

# Keyword arguments for the model load: automatic device placement,
# float16 weights, and remote code enabled.
_model_load_options = dict(
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
model = AutoModelForVision2Seq.from_pretrained(MODEL_ID, **_model_load_options)

# The processor is loaded from the same checkpoint as the model.
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)


In [3]:
import cv2
import numpy as np
from pathlib import Path
from PIL import Image

def load_image(path):
    """Open the image file at ``path`` and return an RGB copy of it.

    The file is opened inside a ``with`` block so the underlying file handle
    is released deterministically once the pixel data has been converted,
    instead of staying open until the source image object is garbage
    collected.

    Args:
        path: Anything ``PIL.Image.open`` accepts as a file location
            (``load_strips`` passes ``pathlib.Path`` objects).

    Returns:
        The image produced by ``.convert("RGB")``.
    """
    with Image.open(path) as source:
        # convert() loads the pixel data and returns a separate image object,
        # so the result stays usable after the source file is closed.
        return source.convert("RGB")

def load_strips(folder_path):
    """Lazily yield every strip stored one directory level below ``folder_path``.

    Sub-directories of ``folder_path`` are visited in sorted order, and the
    files inside each one are also visited in sorted order. Entries that are
    not regular files (e.g. nested directories) and hidden files whose name
    starts with ``.`` (e.g. ``.DS_Store``) are skipped so that a stray
    non-image entry does not abort a long run.

    Args:
        folder_path: Root folder (``str`` or ``Path``) containing one
            sub-directory per group of strips.

    Yields:
        dict with three keys:
            ``"date"``  - the first 10 characters of the file stem,
            ``"name"``  - the file stem from its 12th character onward,
            ``"image"`` - the value returned by ``load_image`` for the file.
        # NOTE(review): the slicing presumes stems shaped like
        # ``YYYY-MM-DD<sep><name>`` - confirm against the dataset.
    """
    root = Path(folder_path)

    for sub_dir in sorted(root.iterdir()):
        if not sub_dir.is_dir():
            continue
        for strip in sorted(sub_dir.iterdir()):
            # Only regular, non-hidden files are handed to the image loader.
            if not strip.is_file() or strip.name.startswith("."):
                continue

            stem = strip.stem
            yield {
                "date": stem[:10],
                "name": stem[11:],
                "image": load_image(strip),
            }

In [4]:
# Instruction text for the model; it is placed in the "text" part of the user
# message next to the strip image. It asks for a panel-by-panel transcription
# with a speaker label on every line.
# NOTE(review): the rules name two different placeholders for a panel without
# dialogue ("(no text)" and "<no character found speaking>") - confirm which
# one any downstream parsing expects.
# NOTE(review): "seperates" / "seperating" are misspelled; left unchanged here
# because this string is runtime input to the model.
prompt = """You are a specialized comic transcription engine. 

**SCANNING INSTRUCTIONS:**
1. Scan the strip from left to right. 
2. Identify the vertical dividing lines (borders) between panels.
3. Every time you cross a border, start a new "PANEL [N]" block.

**IDENTIFICATION GUIDE:**
- DILBERT: Man in white shirt and red/black tie.
- DOGBERT: The small white dog.
- BACKGROUND CHARACTER: Any other character other than Dilbert or Dogbert
- TRACING: There is always a little line next to the text that points to the character speaking. Carefully track/follow the line to the character and identify the character each time.
**OUTPUT FORMAT (STRICT):**
PANEL 1:
[Character Name]: [Text]

PANEL 2:
[Character Name]: [Text]

**STRICT RULES:**
- Do NOT output reasoning, thought tags, or summaries.
- Transcribe text verbatim.
- If a panel is empty, write "(no text)".

Remember there is a thick white region that seperates each panel in the image. Also, remember some panels can have more than 1 character speaking. Some panels can have none.
Say <no character found speaking> for such panels.
Be careful while seperating panels.
"""

In [5]:
# Content of the single user turn: an image slot (no image data attached here)
# followed by the transcription instructions.
user_content = [
    {"type": "image"},
    {"type": "text", "text": prompt},
]
messages = [{"role": "user", "content": user_content}]

# Render the conversation to a plain string (tokenize=False) with the
# generation prompt appended.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)


In [None]:
# Transcribe every strip with the model, echo each result, and append it to
# "qwen7b_direct.txt" as soon as it is produced.
strips = load_strips("dilbert_1989_to_2023")
for strip in strips:
    # Same banner is printed to the notebook and written to the output file.
    header = '\n\n_________________________________________\nDate: ' + strip['date']
    print(header)
    image = strip["image"]

    inputs = processor(text=text, images=image, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        # Greedy decoding (do_sample=False).
        # NOTE(review): 256 new tokens may cut off strips with many panels -
        # confirm the limit is sufficient for the longest strips.
        output_ids = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=False,
        )

    # The returned sequence starts with the prompt tokens. Drop them by
    # position instead of splitting the decoded text on the word "assistant",
    # which would truncate any transcription that contains that word.
    prompt_length = inputs["input_ids"].shape[1]
    generated_ids = output_ids[:, prompt_length:]

    result = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True,
    )[0].strip()
    print(result)

    # Re-opened in append mode on every iteration; the `with` block closes the
    # file, so no explicit close() is needed.
    with open("qwen7b_direct.txt", "a", encoding="utf-8") as f:
        f.write(header + '\n' + result + '\n')



_________________________________________
Date: 1990-01-01
PANEL 1:
DILBERT: I'M GRUMPY TODAY, SO DON'T EVEN TRY TO TALK TO ME.

PANEL 2:
DOGBERT: AND DON'T TRY TO FLATTER ME OR GIVE ME CHOCOLATE CAKE TO MAKE ME FEEL BETTER.

PANEL 3:
DILBERT: AND I GUESS I SHOULDN'T SCRATCH YOU BEHIND THE EARS UNTIL YOU HAVE LITTLE LEG SPASMS.
DOGBERT: RIGHT. NONE OF THAT.


_________________________________________
Date: 1990-01-02
PANEL 1:
DILBERT: I'M STARTING TO WRITE AN UNAUTHORIZED BIOGRAPHY ABOUT YOU.

PANEL 2:
DOGBERT: IT'S KIND OF A "PET AND TELL" EXPOSÉ FULL OF STARTLING REVELATIONS.

PANEL 3:
DILBERT: WHO WOULD BE STARTLED BY MY LIFE?
DOGBERT: I THINK YOU WILL BE.


_________________________________________
Date: 1990-01-03
PANEL 1:
DILBERT: ARE YOU REALLY GOING THROUGH WITH THE UNAUTHORIZED BIOGRAPHY OF ME?

PANEL 2:
DOGBERT: YES.

PANEL 3:
DOGBERT: I'M UP TO THE PART WHERE JACKIE "O" AND LIZ TAYLOR FIGHT A DUEL FOR YOUR LOVE.

PANEL 4:
DOGBERT: TRAGICALLY, NEITHER ARE AWARE THAT YOU'RE 