In [1]:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import os
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load project settings from the .env file one directory above the current
# working directory.
# NOTE(review): the path is relative and this cell changes the working
# directory below, so a re-run resolves "../.env" against HOMEDIR instead.
load_dotenv("../.env")

# HOMEDIR is the directory the notebook switches into; DATADIR is the root
# that later cells read input files from.
homedir = os.getenv("HOMEDIR")
datadir = os.getenv("DATADIR")

# os.getenv returns None for an unset variable. Fail here with a clear message
# instead of an opaque TypeError from os.chdir / os.path.join further down.
missing_vars = [
    name for name, value in (("HOMEDIR", homedir), ("DATADIR", datadir)) if not value
]
if missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_vars)
        + ". Define them in ../.env or in the process environment."
    )

os.chdir(homedir)

In [3]:
# Choose the example document that is sent to the model.
example_dir = os.path.join(datadir, "example_files")

# os.listdir returns entries in arbitrary order; sort them so that index 0
# refers to the same file on every run and on every machine.
listfiles = sorted(os.listdir(example_dir))

# An empty directory would otherwise surface as a bare IndexError below.
if not listfiles:
    raise FileNotFoundError(f"No example files found in {example_dir!r}")

file_path = os.path.join(example_dir, listfiles[0])

In [None]:
# Load the Qwen2-VL checkpoint. torch_dtype="auto" and device_map="auto" leave
# the dtype and device placement to the library.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct",
    torch_dtype="auto",
    device_map="auto",
)

# Optional alternative: flash_attention_2 is recommended for better acceleration
# and memory saving, especially with multiple images or video. The snippet
# below references `torch`, which this notebook does not import as written.
#
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#     "Qwen/Qwen2-VL-7B-Instruct",
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )

# Default processor for the same checkpoint.
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct",
)

# Optional alternative: the default range for the number of visual tokens per
# image is 4-16384. Setting min_pixels / max_pixels narrows it (for example to
# 256-1280 tokens) to balance speed and memory usage.
#
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained(
#     "Qwen/Qwen2-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels
# )

In [None]:
# Instruction text sent to the model together with the image. It asks for the
# text in the image to be transcribed as a complete HTML document and lists
# formatting rules for headings, paragraphs, lists, emphasis, tables, layout
# preservation and illegible text. The string is passed to the model verbatim,
# so any edit to it changes what the model is asked to do.
html_prompt = """
Task:

Extract all the text content from the given image and format it in HTML.
Your goal is to preserve both the content and layout of the text as accurately as possible.


Extraction Rules:

    1. General HTML Structure:
    Wrap the entire output in a <html> document structure:
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Extracted Text</title>
    </head>
    <body>
        <!-- Content goes here -->
    </body>
    </html>

    2. Text Formatting:
    Use appropriate HTML tags for text elements:
        - Headings: <h1>, <h2>, <h3>, etc., for titles and sections.
        - Paragraphs: Wrap regular text in <p> tags.
        - Lists:
            - Use <ul> and <li> for unordered (bulleted) lists.
            - Use <ol> and <li> for ordered (numbered) lists.
        - Emphasis:
            - Bold text: <strong>
            - Italic text: <em>
        - Line breaks: Use <br> for single line breaks where necessary.

    3. Tables:
    Use proper HTML table structure for tabular content:
    <table>
        <thead>
            <tr>
                <th>Column 1</th>
                <th>Column 2</th>
            </tr>
        </thead>
        <tbody>
            <tr>
                <td>Row 1 Col 1</td>
                <td>Row 1 Col 2</td>
            </tr>
            <tr>
                <td>Row 2 Col 1</td>
                <td>Row 2 Col 2</td>
            </tr>
        </tbody>
    </table>

    4. Layout and Hierarchy Preservation:
    - Reproduce the original text structure and hierarchy (headings, subheadings, paragraphs, lists, and tables) as accurately as possible.
    - Retain visual structure, such as spacing and breaks, using <div> or <br> when appropriate.
    - Use nested HTML tags for lists, sublists, or complex structures.

    5. Handling Special Cases:
    - If any part of the image text is illegible, replace it with: <span>[illegible text]</span>
    - Do not add any additional comments or annotations that are not in the original image.
"""

In [None]:
# Single-turn chat request: one user message that carries the image to
# transcribe followed by the HTML-extraction instructions.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": file_path,
            },
            {
                "type": "text",
                "text": html_prompt,
            },
        ],
    }
]

# Preparation for inference
# Render the conversation as a prompt string (tokenize=False) that ends with
# the generation prompt.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
# Extract the image / video inputs referenced by the messages.
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# The model was loaded with device_map="auto", so its placement is decided at
# load time. Follow the model's device instead of hardcoding "cuda", which
# raises on machines without CUDA and may not match where the model lives.
inputs = inputs.to(model.device)

# Inference: Generation of the output
# max_new_tokens caps the length of the generated continuation; long pages may
# be cut off at this limit.
generated_ids = model.generate(**inputs, max_new_tokens=2000)
# For each sequence, drop the leading len(prompt) token ids so that only the
# newly generated continuation is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
# output_text is a list with one decoded string per input sequence.
print(output_text)