In [None]:
# Unpack the source PDFs; pdfs/pdfs.zip must already be present.
!cd pdfs && unzip -qq pdfs.zip
!ls pdfs

In [None]:
# Show the GPUs visible to this runtime.
!nvidia-smi

In [None]:
# `%pip` (instead of `!pip`) installs into the environment of the running kernel.
%pip install -U ms-swift

In [None]:
%pip install flash-attn --no-build-isolation

In [None]:
%pip install decord

In [None]:
%pip install pypdfium2 qwen-vl-utils

In [None]:
%pip install vllm

In [None]:
%pip install "numpy<2"

In [None]:
from pypdfium2 import PdfDocument


def convert_pdf_to_images(pdf_path, scale=1.0):
    """Render every page of a PDF to a PIL image.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``PdfDocument``.
        scale: Rendering scale forwarded to ``page.render``.

    Returns:
        A list with one PIL image per page, in page order.

    Raises:
        Whatever ``PdfDocument``, ``render`` or ``to_pil`` raise. The page and
        document handles are closed before the exception propagates.
    """
    pdf_document = PdfDocument(pdf_path)
    try:
        pages = []
        for page_index in range(len(pdf_document)):
            page = pdf_document[page_index]
            try:
                pages.append(page.render(scale=scale).to_pil())
            finally:
                # Release the page handle even when rendering fails.
                page.close()
        return pages
    finally:
        # Always release the document, including on a failed render.
        pdf_document.close()


# HF

In [None]:
# `%pip` (instead of `!pip`) installs into the environment of the running kernel.
%pip install transformers==4.57.3

In [None]:
import torch
from transformers import AutoProcessor, AutoTokenizer, AutoModelForCausalLM

model_path = 'baidu/ERNIE-4.5-VL-28B-A3B-PT'

# Pass the precision once: `dtype` is the current keyword, and supplying both
# `dtype` and the legacy `torch_dtype` with different values is contradictory.
# `device_map="auto"` lets the weights be placed on the available accelerator(s);
# without it the model is left on the CPU, and the inference loop would run
# there because it moves the inputs to the model's device.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    dtype=torch.bfloat16,
    trust_remote_code=True
)

# `eval()` and `add_image_preprocess()` come from the checkpoint's remote code
# (trust_remote_code=True); they attach the image preprocessing to the model.
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
processor.eval()
model.add_image_preprocess(processor)


In [None]:
import json
from pathlib import Path
from tqdm import tqdm

# System turn sent with every page request.
SYSTEM_PROMPT = "You are a helpful assistant."

# User instructions sent together with each page image: Markdown output rules
# plus mandatory <img data-bbox="..."> tagging of image-like regions.
INSTRUCTIONS = """Convert this page image to specific markdown

---

1) Output format

- Output must be Markdown, ordered in natural human reading order.
- Use '#' to represent heading levels ('#', '##', '###', etc). Keep titles and section headers with correct hierarchy.
- Preserve paragraphs as continuous text blocks.
- Use '-' for bulleted lists.
- Use '1.', '2.', '3.' for numbered lists.
- Tables MUST be represented strictly in HTML.
- Preserve original text and number formatting exactly.
- Ignore page headers and footers unless semantically meaningful.

---

2) Universal image region tagging (MANDATORY)

- For EVERY image-like region (drawings, photos, charts, diagrams, stamps, logos):
  - Output exactly:
    <img data-bbox="x1 y1 x2 y2">DESCRIPTION</img>
- Use a nearby caption if present; otherwise a short factual description.
- Place <img> tags in reading order.

---

Answer only with extracted content in described format without any comments and additions
"""


def build_messages(image_path: str = "tmp.png") -> list[dict]:
    """Build the chat messages for one page in the structured-content format.

    Args:
        image_path: Location of the page image referenced by the user turn.
            Defaults to ``"tmp.png"``, the file name this function used to
            hard-code, so existing zero-argument calls behave as before.

    Returns:
        A two-element list: a system turn carrying ``SYSTEM_PROMPT`` and a user
        turn carrying the image reference followed by ``INSTRUCTIONS``.
    """
    return [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": SYSTEM_PROMPT},
            ],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    # str() lets callers pass a pathlib.Path as well.
                    "image": str(image_path),
                },
                {"type": "text", "text": INSTRUCTIONS},
            ],
        }
    ]


In [None]:
images_dir = Path('page_images')
dataset_path = Path('dataset.jsonl')

# The model is not moved during the loop, so its device is looked up once.
device = next(model.parameters()).device

records = []
for pdf_path in tqdm(Path('pdfs').glob('*.pdf')):
    pages = convert_pdf_to_images(pdf_path, scale=1.0)
    # All pages of one PDF are stored in a directory named after the file stem.
    sample_path = images_dir / pdf_path.stem
    for i, p in enumerate(pages):
        image_path = sample_path / f'{i}.png'
        sample_path.mkdir(parents=True, exist_ok=True)
        page_w, page_h = p.size
        # Persist the page under the path stored in the record's "images"
        # field; previously that file was never written by this cell.
        p.save(image_path)
        # build_messages() references the fixed file name 'tmp.png'.
        p.save('tmp.png')

        messages = build_messages()
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
        )
        image_inputs, video_inputs = processor.process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )

        # Inference: Generation of the output
        inputs = inputs.to(device)

        generated_ids = model.generate(
            inputs=inputs['input_ids'].to(device),
            **inputs,
            max_new_tokens=5000
            )
        # NOTE(review): the whole returned sequence is decoded; if generate()
        # echoes the prompt tokens, the response also contains the prompt — confirm.
        output_text = processor.decode(generated_ids[0])

        record = {
            "page_w": page_w,
            "page_h": page_h,
            "page": i,
            "messages": messages,
            "images": [image_path.absolute().as_posix()],
            "response": output_text
        }
        records.append(record)

# Same "results-<model>.jsonl" naming as the swift-infer result files.
with open("results-ernie.jsonl", "w", encoding="utf-8") as handle:
    for record in records:
        handle.write(json.dumps(record, ensure_ascii=False) + "\n")


# MS-SWIFT

In [None]:
import json
from pathlib import Path

# System turn stored with every dataset record.
SYSTEM_PROMPT = "You are a helpful assistant."

# User instructions stored with every dataset record: Markdown output rules,
# mandatory <img data-bbox="..."> tagging and, for engineering drawings, a JSON
# metadata object (an example in Russian is included in the prompt).
INSTRUCTIONS = """Convert this page image to specific markdown

---

1) Output format

- Output must be Markdown, ordered in natural human reading order.
- Use '#' to represent heading levels ('#', '##', '###', etc). Keep titles and section headers with correct hierarchy.
- Preserve paragraphs as continuous text blocks.
- Use '-' for bulleted lists.
- Use '1.', '2.', '3.' for numbered lists.
- Tables MUST be represented strictly in HTML.
- Preserve original text and number formatting exactly.
- Ignore page headers and footers unless semantically meaningful.

---

2) Universal image region tagging (MANDATORY)

- For EVERY image-like region (drawings, photos, charts, diagrams, stamps, logos):
  - Output exactly:
    <img data-bbox="x1 y1 x2 y2">DESCRIPTION</img>
- Use a nearby caption if present; otherwise a short factual description.
- Place <img> tags in reading order.

---

3) ENGINEERING DRAWINGS

If an engineering drawing or technical scheme is present, add drawing metadata:

Detailed description
- 1-2 sentences to describe drawing content

Pictures:
- Provide drawing bbox: [x1, y1, x2, y2] for all views in "bbox_2d" field with view name

Legend (if present):
- write it in JSON format

Dimensions (if present):
- write it in JSON format
- map dimension values to element name

For example:
{
  "description": "Чертеж описывает дисковый поворотный затвор с ручным редуктором и удлинением штока.
  Наименование: Затвор дисковый поворотный DN200 PN1,0 МПа.
  Модель/Код: КМЗ-СА-ЗД-12-0200-11.
  Рабочее давление: PN 1,0 МПа.
  Температурный режим: В тексте указано «от 60 до 100 °С» , однако наличие исполнения УХЛ3 и использование стали 09Г2С обычно подразумевает работу при температурах от -60°С.
  Герметичность: Класс «А» по ГОСТ 9544-2015.
  Климатическое исполнение: УХЛ3 (для умеренного и холодного климата).
  Управление: Ручной редуктор.
  Комплектация: Поставляется в комплекте с ответными фланцами, прокладками и крепежом.
  Стандарты:
  Строительная длина: ASME B16.10.
  Конструкция соответствует стандартам ASME и EN 593.
  Испытания на огнестойкость: ISO 5208.
  ТУ: 28.14.11-001-88287357-2024.",
  "views": [
    {"bbox_2d": [74, 361, 324, 808], "name": "front"}
  ],
  "legend": {
    "1":  "Корпус  09Г2С (Конструкционная низколегированная сталь)",
    "2":  "Седло  09Г2С",
    "3":  "Диск  A182 F304 (Нержавеющая сталь)",
    "4":  "Вал  A182 F316 (Нержавеющая сталь)",
    "5":  "Уплотнение вала  Графит",
    "6":  "Крышка корпуса  09Г2С",
    "7":  "Болт  A193 B8"
  },
  "dimensions": {
    "DN (Номинальный диаметр)": "200 мм",
    "L (Строительная длина)": "88,5 мм",
    "H (Общая высота)": "1440 мм",
    "H1 (Высота до оси/центра)": "726 мм",
    "Высота удлинительной колонны": "810 мм"
  }
}
---

Answer only with extracted content in described format without any comments and additions
"""


def build_messages() -> list[dict]:
    """Return the system and user turns stored with each dataset record.

    Both turns use plain-string content. The user turn is ``INSTRUCTIONS``
    followed directly by the literal ``<image>`` tag.
    """
    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    # NOTE(review): "<image>" is presumably the placeholder ms-swift replaces
    # with the record's image — confirm against the ms-swift dataset format.
    user_turn = {"role": "user", "content": f"{INSTRUCTIONS}<image>"}
    return [system_turn, user_turn]


images_dir = Path('page_images')
dataset_path = Path('dataset.jsonl')

# One record per PDF page: the page is rendered, saved as
# page_images/<pdf stem>/<page index>.png and described by a dataset row.
records = []
for pdf_path in Path('pdfs').glob('*.pdf'):
    page_dir = images_dir / pdf_path.stem
    for page_index, page_image in enumerate(convert_pdf_to_images(pdf_path, scale=1.0)):
        # Created per page, so a PDF without pages leaves no directory behind.
        page_dir.mkdir(parents=True, exist_ok=True)
        image_path = page_dir / f'{page_index}.png'
        width, height = page_image.size
        page_image.save(image_path)
        records.append({
            "page_w": width,
            "page_h": height,
            "page": page_index,
            "messages": build_messages(),
            "images": [{"path": image_path.absolute().as_posix()}],
        })

# JSON Lines output: one UTF-8 encoded JSON object per line.
with dataset_path.open("w", encoding="utf-8") as handle:
    handle.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in records
    )


In [None]:
# Registry of the models to evaluate. Per entry:
#   chkp    - checkpoint id inserted after `--model`
#   reqs    - pip command placed in front of the `swift infer` call
#   hf      - when true, `--use_hf=1` is appended to the command
#   attn    - value inserted after `--attn_impl`
#   backend - selects the command template ("vllm" or "pt")
models = {
    "qwen": dict(
        chkp="Qwen/Qwen3-VL-30B-A3B-Instruct",
        reqs='pip install transformers==4.57.3',
        hf=True,
        attn="flash_attn",
        backend="vllm",
    ),
    "glm": dict(
        chkp="zai-org/GLM-4.6V-Flash",
        reqs='pip install "transformers>=5.0.0rc0"',
        hf=True,
        attn="flash_attn",
        backend="vllm",
    ),
    "internvl": dict(
        chkp="OpenGVLab/InternVL3-14B-Instruct",
        reqs='pip install transformers==4.57.3',
        hf=True,
        attn="flash_attn",
        backend="vllm",
    ),
    "mistral": dict(
        chkp="mistralai/Ministral-3-14B-Instruct-2512-BF16",
        reqs='pip install "transformers>=5.0.0.dev0" "mistral-common>=1.8.6"',
        hf=True,
        attn="flash_attn",
        backend="pt",
    ),
    "gemma": dict(
        chkp="LLM-Research/gemma-3-27b-it",
        reqs='pip install transformers==4.57.3',
        hf=False,
        attn="flash_attn",
        backend="vllm",
    ),
}


In [None]:
# Print (not run) one notebook shell command per registered model. Each command
# starts with "!", runs the model's pip requirement command and then
# `swift infer` on the exported dataset, writing results-<model_name>.jsonl.
# The trailing backslashes sit inside a non-raw string literal, so they are
# line continuations of the literal itself: every command is printed as one
# long line, with the source indentation kept as spaces between the arguments.
for model_name, meta in models.items():
    chkp = meta["chkp"]
    use_hf = meta["hf"]
    reqs = meta["reqs"]
    attn = meta["attn"]
    backend = meta["backend"]
    # Template for the vLLM inference backend.
    if backend == "vllm":
        print(f'''!{reqs} && CUDA_VISIBLE_DEVICES=0 \
        VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
        swift infer \
            --model {chkp} \
            --infer_backend vllm \
            --temperature 0.3 \
            --num_beams 2 \
            --attn_impl {attn} \
            --vllm_gpu_memory_utilization 0.9 \
            --vllm_max_model_len 32768 \
            --val_dataset {dataset_path.absolute().as_posix()} \
            --max_new_tokens 5000 \
            --result_path results-{model_name}.jsonl \
            {"--use_hf=1" if use_hf else ""}
        ''')
    # Template for the "pt" backend; any other backend value prints nothing.
    elif backend == "pt":
        print(f'''!{reqs} && CUDA_VISIBLE_DEVICES=0 \
        swift infer \
            --model {chkp} \
            --torch_dtype bfloat16 \
            --infer_backend pt \
            --max_batch_size 16 \
            --temperature 0.3 \
            --num_beams 2 \
            --attn_impl {attn} \
            --val_dataset {dataset_path.absolute().as_posix()} \
            --max_new_tokens 5000 \
            --result_path results-{model_name}.jsonl \
            {"--use_hf=1" if use_hf else ""}
        ''')


# Postprocess

In [None]:
import json
import os
import re
from collections import defaultdict
from pathlib import Path
from PIL import Image


In [None]:
def replace_img_tags_with_cropped_images(
    markdown_content: str,
    page_image_path: str | Path,
    page_num: int,
    output_path: Path,
    normalize_factor: float | None = None
) -> str:
    """
    Replace <img ... data-bbox="x1 y1 x2 y2">desc</img>
    with ![desc](./images/filename.png) and save cropped images.
    """

    output_path = Path(output_path)
    cropped_dir = output_path / "images"
    cropped_dir.mkdir(parents=True, exist_ok=True)

    page_image_path = Path(page_image_path)

    # Flexible matching: any attributes, any whitespace/newlines in inner text
    img_pattern = re.compile(
        r'<img\b[^>]*\bdata-bbox="([^"]+)"[^>]*>(.*?)</img>',
        re.IGNORECASE | re.DOTALL
    )

    img_counter = 0

    def replace_img_tag(match: re.Match) -> str:
        nonlocal img_counter
        bbox_str = match.group(1).strip()
        description = re.sub(r"\s+", " ", match.group(2).strip())

        # Parse bbox coordinates
        try:
            parts = re.split(r"[,\s]+", bbox_str)
            bbox_values = [float(p) for p in parts if p]
            if len(bbox_values) != 4:
                return match.group(0)

            x1, y1, x2, y2 = bbox_values

            if not page_image_path.exists():
                # Can't crop if file not found
                return match.group(0)

            with Image.open(page_image_path) as img:
                width, height = img.size

                if normalize_factor is not None:
                    x1, y1, x2, y2 = [c / normalize_factor for c in (x1, y1, x2, y2)]

                # normalized 0..1 coords
                if max(x1, y1, x2, y2) <= 1.0:
                    x1, x2 = x1 * width, x2 * width
                    y1, y2 = y1 * height, y2 * height

                # Clamp and convert
                x1i, y1i = max(0, int(round(x1))), max(0, int(round(y1)))
                x2i, y2i = min(width, int(round(x2))), min(height, int(round(y2)))

                if x2i <= x1i or y2i <= y1i:
                    return match.group(0)

                cropped_img = img.crop((x1i, y1i, x2i, y2i))

                img_counter += 1
                crop_filename = f"page_{page_num:03d}_img_{img_counter:02d}.png"
                crop_path = cropped_dir / crop_filename
                cropped_img.save(crop_path)

                rel_path = f"./images/{crop_filename}"
                return f"![{description}]({rel_path})"

        except Exception:
            return match.group(0)

    return img_pattern.sub(replace_img_tag, markdown_content)


In [None]:
import json
import os
from collections import defaultdict


def clean_markdown_response(response):
    """Strip model-specific wrappers from a raw response.

    If the response contains a ``<|begin_of_box|>`` marker (GLM output), only
    the text after it, up to ``<|end_of_box|>`` when that marker follows, is
    kept. Responses without the marker are left intact. A leading
    ```` ```markdown ```` or ```` ``` ```` fence and a trailing ```` ``` ````
    fence are then removed.

    Args:
        response: Raw response text.

    Returns:
        The cleaned text with surrounding whitespace stripped.
    """
    begin_marker = '<|begin_of_box|>'
    end_marker = '<|end_of_box|>'

    # Slice only when the marker is really present: find() returns -1 otherwise,
    # and slicing with that value would cut characters off ordinary responses.
    start_i = response.find(begin_marker)
    if start_i != -1:
        start_i += len(begin_marker)
        end_i = response.find(end_marker, start_i)
        # An unterminated box (e.g. truncated generation) runs to the end.
        response = response[start_i:end_i] if end_i != -1 else response[start_i:]

    response = response.strip()
    if response.startswith("```markdown"):
        response = response[11:].strip()
    if response.startswith("```"):
        response = response[3:].strip()
    if response.endswith("```"):
        response = response[:-3].strip()
    return response


def process_jsonl_file(jsonl_path, outpu_dir, normalize_factor):
    """Turn a JSONL results file into one Markdown document per source PDF.

    Each non-empty line is a JSON record with a ``response`` and an ``images``
    list. The first image entry may be either ``{"path": ...}`` or a plain path
    string. The path must end in ``<pdf stem>/<page number>.<ext>``. Records
    without a response or an image path are skipped.

    Args:
        jsonl_path: Results file to read.
        outpu_dir: Root output directory. ``<outpu_dir>/<pdf stem>/<pdf stem>.md``
            is written for every PDF that has at least one usable record.
        normalize_factor: Forwarded to ``replace_img_tags_with_cropped_images``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        ValueError: If a page file name is not an integer.
    """
    outpu_dir = Path(outpu_dir)
    documents = defaultdict(list)

    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines, e.g. a trailing newline at end of file.
                continue

            record = json.loads(line)
            response = record.get("response")
            if not response:
                continue

            cleaned_response = clean_markdown_response(response)

            images = record.get("images")
            if not images:
                continue

            # Both record layouts produced in this notebook are accepted.
            first_image = images[0]
            if isinstance(first_image, dict):
                image_path = first_image.get("path")
            else:
                image_path = first_image
            if not image_path:
                continue

            # Parse path structure: .../<pdf stem>/<page number>.<ext>
            page_image = Path(image_path)
            pdf_file_stem = page_image.parent.name
            page_num = int(page_image.stem)

            # Process image tags in the markdown
            processed_response = replace_img_tags_with_cropped_images(
                cleaned_response,
                image_path,
                page_num,
                output_path=outpu_dir / pdf_file_stem,
                normalize_factor=normalize_factor
            )
            documents[pdf_file_stem].append({
                'page_num': page_num,
                'content': processed_response
            })

    # Create markdown files
    for pdf_file_stem, pages in documents.items():
        # Records may arrive in any order; the document follows page order.
        pages.sort(key=lambda x: x['page_num'])
        full_markdown = "\n\n".join(page['content'] for page in pages)
        md_dir = outpu_dir / pdf_file_stem
        md_dir.mkdir(parents=True, exist_ok=True)
        md_filename = md_dir / f"{pdf_file_stem}.md"

        with open(md_filename, 'w', encoding='utf-8') as md_file:
            md_file.write(full_markdown)


In [None]:
for model_name, meta in models.items():
    print(model_name)
    jsonl_file = Path(f"results-{model_name}.jsonl")
    outpu_dir = Path(model_name)
    normalize_factor = 1000

    process_jsonl_file(jsonl_file, outpu_dir, normalize_factor)
    !zip -qq -r {model_name}.zip {model_name}
