In [None]:
# !git clone https://github.com/Tencent-Hunyuan/HunyuanOCR.git

In [None]:
# %cd HunyuanOCR
# !pip install "vllm>=0.12.0"  # quoted so the shell does not treat ">=0.12.0" as an output redirect
# !pip install -r requirements.txt

In [None]:
from vllm import LLM, SamplingParams
from PIL import Image
from transformers import AutoProcessor


def clean_repeated_substrings(text, min_length=8000, min_repeats=10):
    """Trim a back-to-back repeated substring from the end of ``text``.

    The tail of ``text`` is checked for a unit (2 or more characters long)
    that is repeated consecutively up to the very end of the string. When a
    unit is found that repeats at least ``min_repeats`` times, all but one
    copy of it is removed. Shorter units are tried first, and the first unit
    that qualifies wins.

    Args:
        text: String to inspect.
        min_length: Texts with fewer characters than this are returned
            unchanged without scanning. Defaults to 8000.
        min_repeats: Minimum number of consecutive copies of the unit that
            must end the text before it is trimmed. Defaults to 10.

    Returns:
        ``text`` itself when it is shorter than ``min_length`` or no
        qualifying run is found; otherwise ``text`` with the trailing run
        collapsed to a single copy of the repeated unit.

    Raises:
        ValueError: If ``min_repeats`` is smaller than 1.
    """
    if min_repeats < 1:
        raise ValueError("min_repeats must be at least 1")

    n = len(text)
    if n < min_length:
        return text

    # A unit that occurs min_repeats times has to fit inside the text, so its
    # length can never exceed n // min_repeats.
    for length in range(2, n // min_repeats + 1):
        candidate = text[-length:]
        count = 0
        i = n - length

        # Walk backwards in steps of `length` while each chunk equals the tail.
        while i >= 0 and text[i:i + length] == candidate:
            count += 1
            i -= length

        if count >= min_repeats:
            # Drop (count - 1) copies so exactly one copy of the unit remains.
            return text[:n - length * (count - 1)]

    return text


# Model identifier shared by the vLLM engine and the chat-template processor.
model_path = "tencent/HunyuanOCR"
# vLLM engine used for generation. trust_remote_code=True is passed through to
# vLLM as-is — NOTE(review): this allows code shipped with the model repo to run.
llm = LLM(model=model_path, trust_remote_code=True)
# In the visible cells the processor is only used to render the chat template
# into a prompt string (apply_chat_template with tokenize=False).
processor = AutoProcessor.from_pretrained(model_path)
# Decoding settings handed to llm.generate. temperature=0 with top_k=1 is
# presumably meant to give greedy, repeatable decoding; repetition_penalty=1.0
# looks like the neutral "no penalty" value — confirm against the vLLM docs.
# max_tokens=3000 is the generation cap for requests that use these params as-is.
sampling_params = SamplingParams(
    temperature=0,
    max_tokens=3000,
    top_k=1,
    repetition_penalty=1.0
)


In [None]:
!pip install pypdfium2

In [None]:
import os
import re
import tempfile
from pathlib import Path
from typing import List, Optional

import torch
from PIL import Image
from tqdm import tqdm

from pypdfium2 import PdfDocument


def convert_pdf_to_images(pdf_path: Path, scale: float = 2.0) -> List[Image.Image]:
    """Render every page of a PDF file to a PIL image.

    Args:
        pdf_path: Location of the PDF. A plain string is accepted as well and
            is converted to a ``Path``.
        scale: Value forwarded to ``page.render(scale=...)`` for every page.

    Returns:
        One PIL image per page, in page order. An empty list is returned when
        ``pdf_path`` does not exist.
    """
    pdf_path = Path(pdf_path)
    if not pdf_path.exists():
        return []

    pdf_document = PdfDocument(pdf_path.as_posix())
    pages: List[Image.Image] = []
    try:
        for page_index in range(len(pdf_document)):
            page = pdf_document[page_index]
            try:
                pages.append(page.render(scale=scale).to_pil())
            finally:
                # Close the page handle even when rendering this page fails.
                page.close()
    finally:
        # Close the document even when a page raised part-way through.
        pdf_document.close()
    return pages


def _device() -> str:
    return "cuda" if torch.cuda.is_available() else "cpu"


def _ensure_bf16_supported_or_fallback(dtype_preferred=torch.bfloat16):
    # BF16 is ideal on Ampere+; fallback to FP16 on older GPUs or CPU.
    if _device() == "cpu":
        return torch.float32
    # Most CUDA GPUs can do fp16; bf16 depends.
    try:
        _ = torch.tensor([1.0], device="cuda", dtype=torch.bfloat16)
        return torch.bfloat16
    except Exception:
        return torch.float16


def _join_pages_as_md(page_texts: List[str]) -> str:
    # Simple join with page separators (safe default).
    out = []
    for i, t in enumerate(page_texts, start=1):
        t = t.strip()
        if not t:
            continue
        out.append(f"\n\n---\n\n<!-- Page {i} -->\n\n{t}")
    return "".join(out).lstrip()


def _save_pil_to_temp_png(img: Image.Image, tmpdir: Path, page_index: int) -> Path:
    """Write ``img`` as a PNG inside ``tmpdir`` and return the file's path.

    The file is named ``page_<index>.png`` with the index zero-padded to five
    digits.
    """
    target = tmpdir / f"page_{page_index:05d}.png"
    img.save(target, format="PNG")
    return target


In [None]:
def pdf_to_md_hy_ocr(
    pdf_path: Path,
    scale: float = 2,
    max_new_tokens: int = 5000,
) -> str:
    """OCR a PDF page by page and return the combined Markdown.

    Every page is rendered to an image, sent through the module-level ``llm``
    with a fixed document-parsing instruction, cleaned with
    ``clean_repeated_substrings`` and finally merged with
    ``_join_pages_as_md``.

    Args:
        pdf_path: PDF file to process; its stem labels the progress bar.
        scale: Render scale forwarded to ``convert_pdf_to_images``.
        max_new_tokens: Generation cap per page. It overrides ``max_tokens``
            on a copy of the module-level ``sampling_params``; the shared
            object itself is not modified.

    Returns:
        Markdown for all non-empty pages, or an empty string when no page
        could be rendered.
    """
    pages = convert_pdf_to_images(pdf_path, scale=scale)
    if not pages:
        return ""

    task = "Extract all information from the main body of the document image and represent it in markdown format, ignoring headers and footers. Tables should be expressed in HTML format, formulas in the document should be represented using LaTeX format, and the parsing should be organized according to the reading order."

    # The prompt text does not depend on the page, so it is rendered once.
    # 'page.png' only fills the image slot of the chat template; the pixels
    # reach the model through multi_modal_data below, so nothing is written
    # to disk. NOTE(review): relies on apply_chat_template(tokenize=False)
    # not opening the referenced file — confirm for the installed transformers.
    messages = [
        {"role": "system", "content": ""},
        {"role": "user", "content": [
            {"type": "image", "image": 'page.png'},
            {"type": "text", "text": task}
        ]}
    ]
    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Honour the caller's token budget without touching the shared settings.
    page_params = sampling_params.clone()
    page_params.max_tokens = max_new_tokens

    page_texts = []
    for img in tqdm(pages, desc=pdf_path.stem):
        inputs = {"prompt": prompt, "multi_modal_data": {"image": [img]}}
        output = llm.generate(
            [inputs],
            page_params
        )[0]
        out_text = clean_repeated_substrings(output.outputs[0].text)

        page_texts.append(out_text.strip())

    return _join_pages_as_md(page_texts)



In [None]:
# Convert every PDF found in ./pdfs and store the Markdown under ./hy_ocr,
# one <pdf stem>.md file per input.
output_path = Path("hy_ocr")
output_path.mkdir(parents=True, exist_ok=True)
# sorted() makes the processing order reproducible across runs and platforms.
for pdf_path in sorted(Path('pdfs').glob('*.pdf')):
    md_result = pdf_to_md_hy_ocr(
        pdf_path,
        scale=2
    )
    # OCR text can contain non-ASCII characters; an explicit encoding keeps the
    # write independent of the platform's default locale.
    with open(output_path / pdf_path.with_suffix('.md').name, 'w', encoding='utf-8') as f:
        f.write(md_result)


In [None]:
# Pack the hy_ocr output directory into hy_ocr.zip (-r: recurse into the directory, -qq: quiet).
!zip -r -qq hy_ocr.zip hy_ocr