In [None]:
from transformers import AutoProcessor, AutoModelForVision2Seq
import torch

# Hugging Face Hub identifier of the vision-language checkpoint used below.
MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"

# Weights are requested in float16; device_map="auto" leaves device placement
# to the loading library.
# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# to run locally -- confirm it is actually required for this model.
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True
)

# Processor loaded from the same checkpoint; later cells use it to render the
# chat template, prepare model inputs and decode generated ids.
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)


In [3]:
import cv2
import numpy as np
from pathlib import Path
from PIL import Image

def load_image(path):
    """Read the image stored at ``path`` and return it in RGB mode.

    Args:
        path: Location of the image file, passed straight to ``Image.open``.

    Returns:
        The image object produced by ``convert("RGB")``.
    """
    # The context manager closes the underlying file as soon as the pixels
    # have been converted, instead of leaving the handle open until the
    # source image object happens to be garbage collected.
    with Image.open(path) as source:
        return source.convert("RGB")

def load_strips(folder_path):
    """Lazily yield every strip found one directory level below ``folder_path``.

    Sub-directories are visited in sorted order, and the files inside each one
    are visited in sorted order as well. Entries of ``folder_path`` that are
    not directories, and entries of a sub-directory that are not regular
    files, are skipped.

    Args:
        folder_path: Root folder (str or Path) containing one sub-directory
            per group of strips.

    Yields:
        dict with three keys:
            "date":  the first 10 characters of the file stem,
            "name":  the file stem from character 11 onwards (the character
                     at index 10 is treated as a separator and dropped),
            "image": the result of ``load_image`` for that file.
    """
    root = Path(folder_path)

    for sub_dir in sorted(root.iterdir()):
        if not sub_dir.is_dir():
            continue
        for strip in sorted(sub_dir.iterdir()):
            # A nested directory cannot be opened as an image; skip it rather
            # than letting load_image raise and abort the whole generator.
            if not strip.is_file():
                continue

            stem = strip.stem
            yield {
                "date": stem[:10],
                "name": stem[11:],
                "image": load_image(strip),
            }

In [4]:
def lines_to_spaces(white_lines):
    """Collapse consecutive line indices into inclusive ``[first, last]`` runs.

    Args:
        white_lines: Sequence of integer indices; a run continues while each
            index is exactly one more than the index before it.

    Returns:
        A list of two-element lists ``[first, last]``, one per run, in input
        order. An empty input gives an empty list.
    """
    if not white_lines:
        return []

    runs = []
    run_start = run_end = white_lines[0]
    for line in white_lines[1:]:
        if line == run_end + 1:
            # Still adjacent to the previous index: grow the current run.
            run_end = line
        else:
            # Gap found: close the current run and start a new one here.
            runs.append([run_start, run_end])
            run_start = run_end = line

    # The run still open when the input ends has not been recorded yet.
    runs.append([run_start, run_end])
    return runs

def strip_cropper(spaces, dim_length, min_fraction=0.3):
    """Turn blank runs into ``[start, stop]`` crop ranges along one axis.

    The axis is cut at the midpoint of every blank run. Segments that are not
    longer than ``min_fraction`` times the longest segment are discarded.

    Args:
        spaces: Iterable of ``[first, last]`` blank runs (inclusive indices),
            in ascending order, as produced by ``lines_to_spaces``.
        dim_length: Size of the axis being split (image height or width).
        min_fraction: A segment is kept only if its length is strictly greater
            than this fraction of the longest segment. Defaults to 0.3.

    Returns:
        A list of ``[start, stop]`` pairs meant to be used as half-open slice
        bounds (``img[start:stop]``).
    """
    # Cut positions: the axis start, the midpoint of each blank run, and the
    # axis end. The end is dim_length (not dim_length - 1) because callers
    # slice with an exclusive upper bound; using dim_length - 1 would drop the
    # final row/column of the image.
    splits = [0]
    splits.extend((first + last) // 2 for first, last in spaces)
    splits.append(dim_length)

    segments = list(zip(splits, splits[1:]))
    longest = max(stop - start for start, stop in segments)

    return [
        [start, stop]
        for start, stop in segments
        if stop - start > min_fraction * longest
    ]

def horizontal_splitter(img):
    """Split an image into horizontal bands separated by near-white rows.

    A row counts as blank when more than 99.7% of its grayscale pixels are
    brighter than 220. Consecutive blank rows are grouped into runs, and the
    image is cut at the runs via ``lines_to_spaces`` and ``strip_cropper``.

    Args:
        img: Image array accepted by ``cv2.cvtColor(..., cv2.COLOR_BGR2GRAY)``.

    Returns:
        A list of row-slices of ``img`` (full width), one per retained band.
    """
    img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    h, w = img_gray.shape

    # Count bright pixels per row in one vectorised pass instead of visiting
    # every pixel from Python; thresholds are the same as the scalar version.
    white_per_row = np.count_nonzero(img_gray > 220, axis=1)
    # .tolist() gives a plain list of Python ints, which is what
    # lines_to_spaces expects (it tests the sequence for emptiness).
    white_lines = np.flatnonzero(white_per_row / w > 0.997).tolist()

    blank_regions = lines_to_spaces(white_lines)
    sub_strips = strip_cropper(blank_regions, h)

    return [img[top:bottom, :] for top, bottom in sub_strips]

    
def vertical_splitter(img):
    """Split an image into side-by-side panels separated by near-white columns.

    Only the rows near the top and bottom edges are inspected: indices
    ``0 .. 3*h//20`` and ``19*h//20 .. h-1``. A column counts as blank when
    the number of sampled pixels brighter than 230, divided by ``0.20 * h``,
    exceeds 0.98. Blank columns are grouped into runs and the image is cut at
    the runs via ``lines_to_spaces`` and ``strip_cropper``.

    Args:
        img: Image array accepted by ``cv2.cvtColor(..., cv2.COLOR_BGR2GRAY)``.

    Returns:
        A list of column-slices of ``img`` (full height), one per retained
        panel.
    """
    img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    h, w = img_gray.shape

    # Row indices to sample; built once instead of once per column.
    # NOTE(review): presumably the middle of the band is skipped because
    # artwork may cross the gutter there -- confirm with the author.
    sampled_rows = list(range(3 * h // 20 + 1)) + list(range(19 * h // 20, h))

    # Count bright sampled pixels per column in one vectorised pass.
    white_per_col = np.count_nonzero(img_gray[sampled_rows, :] > 230, axis=0)

    # The denominator is the nominal 0.20 * h, not len(sampled_rows); it is
    # kept exactly as in the scalar version so the detection threshold does
    # not shift. .tolist() yields the plain list lines_to_spaces expects.
    white_lines = np.flatnonzero(white_per_col / (0.20 * h) > 0.98).tolist()

    blank_regions = lines_to_spaces(white_lines)
    panels = strip_cropper(blank_regions, w)

    return [img[:, left:right] for left, right in panels]


def panelizer(strip):
    """Cut one strip into its individual panels.

    The strip image is converted to an array, reordered from RGB to BGR, split
    into horizontal bands, and each band is then split into panels.

    Args:
        strip: Mapping with an "image" entry holding the strip picture
            (a PIL image, as yielded by ``load_strips``).

    Returns:
        A flat list of BGR panel arrays: bands in the order returned by
        ``horizontal_splitter``, panels within a band in the order returned by
        ``vertical_splitter``.
    """
    rgb_pixels = np.array(strip["image"])
    bgr_pixels = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)

    return [
        panel
        for band in horizontal_splitter(bgr_pixels)
        for panel in vertical_splitter(band)
    ]

In [44]:
# Instruction text for transcribing one comic panel. This is the variant that
# the chat message built in the next cell embeds. Compared with `prompt`
# below, it words the DOGBERT description and the speaker-assignment rule
# differently; the rest of the text is the same.
prompt2 = """Role: You are a STRICT comic panel transcription engine.

Context:
- This image is a SINGLE comic panel.
- The panel boundaries are already correct.
- Your job is ONLY to transcribe spoken dialogue in this panel.

CHARACTER DEFINITIONS:
- DILBERT: A man wearing a white shirt and a red/black tie.
- DOGBERT: A small white egg-shaped cartoon dog.
- BACKGROUND CHARACTER: Any other character OR any speaker that cannot be identified with certainty.

TASK:
Transcribe ONLY dialogue that is SPOKEN ALOUD.

ABSOLUTE RULES (CRITICAL):
- ONLY transcribe text that appears inside a speech bubble.
- ONLY assign a speaker after you CLEARLY and UNAMBIGUOUSLY detect which character the speech text tail points to.
- NEVER guess the speaker.
- If there is ANY ambiguity about who is speaking, use:
  BACKGROUND CHARACTER

DO NOT transcribe:
- Titles
- Captions
- Sound effects
- Background text
- Thoughts or narration
- Any text NOT inside a speech bubble

OUTPUT RULES:
- Output one line per spoken dialogue.
- Use EXACT casing and punctuation.
- Preserve line breaks inside the bubble if present.
- Do NOT merge dialogue from different speakers.

OUTPUT FORMAT (STRICT):
[Character]: [Spoken text]

If the panel contains NO spoken dialogue, output EXACTLY:
<No conversation>
"""

# Alternative wording of the panel-transcription instructions: DOGBERT is
# described as a dog "with an egg-shaped head" and the speaker rule is phrased
# in terms of the speech bubble tail.
# NOTE(review): no code visible in this notebook references `prompt` (the chat
# message uses `prompt2`) -- confirm whether this variant is still needed.
prompt = """Role: You are a STRICT comic panel transcription engine.

Context:
- This image is a SINGLE comic panel.
- The panel boundaries are already correct.
- Your job is ONLY to transcribe spoken dialogue in this panel.

CHARACTER DEFINITIONS:
- DILBERT: A man wearing a white shirt and a red/black tie.
- DOGBERT: A small white dog with an egg-shaped head.
- BACKGROUND CHARACTER: Any other character OR any speaker that cannot be identified with certainty.

TASK:
Transcribe ONLY dialogue that is SPOKEN ALOUD.

ABSOLUTE RULES (CRITICAL):
- ONLY transcribe text that appears inside a speech bubble.
- ONLY assign a speaker if the speech bubble tail CLEARLY and UNAMBIGUOUSLY points to that character.
- NEVER guess the speaker.
- If there is ANY ambiguity about who is speaking, use:
  BACKGROUND CHARACTER

DO NOT transcribe:
- Titles
- Captions
- Sound effects
- Background text
- Thoughts or narration
- Any text NOT inside a speech bubble

OUTPUT RULES:
- Output one line per spoken dialogue.
- Use EXACT casing and punctuation.
- Preserve line breaks inside the bubble if present.
- Do NOT merge dialogue from different speakers.

OUTPUT FORMAT (STRICT):
[Character]: [Spoken text]

If the panel contains NO spoken dialogue, output EXACTLY:
<No conversation>
"""

In [45]:
# One user turn: an image placeholder followed by the transcription
# instructions held in prompt2.
messages = [
    {
        "role": "user",
        "content": [{"type": "image"}, {"type": "text", "text": prompt2}],
    }
]

# Render the conversation to a prompt string (tokenize=False) with the
# generation prompt appended; the same string is reused for every panel.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)


In [None]:
# Transcribe every panel of every strip with the vision-language model,
# echoing each result to stdout and logging it to OUTPUT_PATH.
OUTPUT_PATH = "qwen7b_processed.txt"

strips = load_strips("dilbert_1989_to_2023")

# Opening once in "w" mode discards the log of any previous run. The handle is
# flushed after every write so partial results survive an interrupted run.
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    for strip in strips:
        print('\n\n_________________________________________\nDate:', strip['date'])
        f.write('\n\n_________________________________________\nDate: ' + strip['date'] + '\n')
        f.flush()

        panels = panelizer(strip)
        for i, panel in enumerate(panels):
            print("\nPanel", str(i+1) + ':')

            # panelizer returns BGR arrays; the processor is given an RGB PIL image.
            panel_image = Image.fromarray(cv2.cvtColor(panel, cv2.COLOR_BGR2RGB))
            inputs = processor(
                text=text,
                images=panel_image,
                return_tensors="pt"
            )
            inputs = {k: v.to(model.device) for k, v in inputs.items()}

            with torch.no_grad():
                output_ids = model.generate(
                    **inputs,
                    max_new_tokens=256,
                    do_sample=False,
                )

            # generate() returns the prompt tokens followed by the new tokens.
            # Decode only the new tokens: recovering the reply by splitting the
            # full decoded text on the word "assistant" would cut off any
            # transcription that itself contains that word.
            prompt_length = inputs["input_ids"].shape[1]
            result = processor.batch_decode(
                output_ids[:, prompt_length:],
                skip_special_tokens=True
            )[0].strip()
            print(result)

            f.write('\nPanel ' + str(i+1) + ':\n' + result + '\n')
            f.flush()



_________________________________________
Date: 1990-01-01

Panel 1:
DILBERT: I'M GRUMPY TODAY, SO DON'T EVEN TRY TO TALK TO ME.

Panel 2:
DOGBERT: AND DON'T TRY TO FLATTER ME OR GIVE ME CHOCOLATE CAKE TO MAKE ME FEEL BETTER.

Panel 3:
DILBERT: AND I GUESS I SHOULDN'T SCRATCH YOU BEHIND THE EARS UNTIL YOU HAVE LITTLE LEG SPASMS.
DOGBERT: RIGHT. NONE OF THAT.


_________________________________________
Date: 1990-01-02

Panel 1:
DILBERT: I'M STARTING TO WRITE AN UNAUTHORIZED BIOGRAPHY ABOUT YOU.

Panel 2:
DOGBERT: IT'S KIND OF A "PET AND TELL" EXPOSÉ FULL OF STARTLING REVELATIONS.

Panel 3:
DILBERT: WHO WOULD BE STARTLED BY MY LIFE?
DOGBERT: I THINK YOU WILL BE.


_________________________________________
Date: 1990-01-03

Panel 1:
DILBERT: ARE YOU REALLY GOING THROUGH WITH THE UNAUTHORIZED BIOGRAPHY OF ME?
DOGBERT: YES.

Panel 2:
DOGBERT: I'M UP TO THE PART WHERE JACKIE "O" AND LIZ TAYLOR FIGHT A DUEL FOR YOUR LOVE.

Panel 3:
BACKGROUND CHARACTER: TRAGICALLY, NEITHER ARE AWARE THAT Y