# MinerU

In [None]:
!pip install --upgrade pip
!pip install uv
!uv pip install -U "mineru[core,vllm]"

In [None]:
!mineru -p pdfs -o mineru_ppl -l cyrillic -b pipeline


In [None]:
!zip -r -qq mineru_ppl.zip mineru_ppl

# MinerU-VLM

In [None]:
!mineru -p pdfs -o mineru_vlm -l cyrillic -b vlm-transformers


In [None]:
!zip -r -qq mineru_vlm.zip mineru_vlm

# Docling

In [None]:
!pip install docling[vlm,easyocr]==2.65.0

In [None]:
from pathlib import Path
import pandas as pd

from docling_core.types.doc import ImageRefMode, TextItem, PictureItem, TableItem, ListItem
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions, TableFormerMode
from docling.datamodel.settings import settings

CAPTION = "caption"
FOOTNOTE = "footnote"
FORMULA = "formula"
LIST_ITEM = "list_item"
PAGE_FOOTER = "page_footer"
PAGE_HEADER = "page_header"
PICTURE = "picture"
SECTION_HEADER = "section_header"
TABLE = "table"
TEXT = "text"
TITLE = "title"
DOCUMENT_INDEX = "document_index"
CODE = "code"
CHECKBOX_SELECTED = "checkbox_selected"
CHECKBOX_UNSELECTED = "checkbox_unselected"
FORM = "form"
KEY_VALUE_REGION = "key_value_region"


def convert_pdf(input_pdf: str, out_dir: str, device: str = "CUDA"):
    """Convert one PDF to a Markdown file using the standard Docling PDF pipeline.

    The converted document is walked with ``iterate_items``. Headings,
    paragraphs, list items, tables and pictures are collected as annotation
    dicts and rendered to ``<out_dir>/<pdf stem>.md``. Table crops are saved as
    PNG files under ``<out_dir>/images``. Page headers/footers, items without
    provenance and items without a usable image crop are skipped.

    Args:
        input_pdf: Path to the source PDF.
        out_dir: Output directory; created if missing.
        device: Name of an ``AcceleratorDevice`` member (case-insensitive),
            e.g. ``"CUDA"`` or ``"CPU"``.

    Returns:
        None. The result is written to disk.
    """
    input_pdf = Path(input_pdf)
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    accelerator_options = AcceleratorOptions(
        num_threads=8,
        device=getattr(AcceleratorDevice, device.upper()),
    )

    # Pipeline options (keep images so item crops can be exported).
    opts = PdfPipelineOptions()
    # Accelerator settings belong on the pipeline options; passing them to
    # PdfFormatOption does not configure the pipeline.
    opts.accelerator_options = accelerator_options
    opts.images_scale = 2.0
    opts.generate_page_images = True
    opts.generate_picture_images = True
    opts.ocr_options = EasyOcrOptions(lang=['ru', 'en'], download_enabled=True)
    opts.table_structure_options.mode = TableFormerMode.ACCURATE  # use more accurate TableFormer model
    opts.table_structure_options.do_cell_matching = True

    # (Optional) pipeline profiling info
    settings.debug.profile_pipeline_timings = True

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=opts),
        }
    )
    doc = converter.convert(input_pdf).document

    images_dir = out_dir / "images"
    images_dir.mkdir(parents=True, exist_ok=True)

    table_counter = 0
    annotations = []
    # Iterate the elements in reading order (the hierarchy level is not used).
    for item, _level in doc.iterate_items():
        # Provenance is needed for the page number and the picture bbox.
        if not item.prov:
            continue
        # Keep main content only.
        if item.label in (PAGE_FOOTER, PAGE_HEADER):
            continue
        # Skip items without a crop or with a zero-sized crop.
        item_image = item.get_image(doc)
        if item_image is None or not all(item_image.size):
            continue

        first_prov = item.prov[0]
        item_data = {
            'id': len(annotations) + 1,
            'page': first_prov.page_no,
        }

        if isinstance(item, TextItem):
            if item.label.lower() in (SECTION_HEADER, TITLE):
                tag = 'h'
                # Title items carry no ``level`` attribute; treat them as level 1.
                item_data['hash_count'] = getattr(item, 'level', 1)
            elif isinstance(item, ListItem):
                tag = 'li'
            else:
                tag = 'p'
            content = {
                'text': item.text,
                'type': tag,
            }
        elif isinstance(item, TableItem):
            table_counter += 1
            element_image_filename = images_dir / f"table-{table_counter}.png"
            with element_image_filename.open("wb") as fp:
                item_image.save(fp, "PNG")
            content = {
                'html': item.export_to_html(doc=doc),
                'type': 'table',
                'src': str(element_image_filename),
            }
        elif isinstance(item, PictureItem):
            page_size = doc.pages[first_prov.page_no].size
            w, h = page_size.width, page_size.height
            # Scale the bbox to a 0..1000 range relative to the page size.
            # NOTE(review): the bbox is used as stored in the provenance; confirm its
            # coordinate origin matches what downstream consumers expect.
            bbox = (
                first_prov.bbox.l / w * 1000,
                first_prov.bbox.t / h * 1000,
                first_prov.bbox.r / w * 1000,
                first_prov.bbox.b / h * 1000,
            )
            if any(c > 1000 for c in bbox):
                print('Broken bbox!')
            content = {
                'bbox': list(map(int, bbox)),
                'type': 'img',
            }
        else:
            # Any other item kind has no Markdown rendering here; skip it instead
            # of reusing the previous item's content.
            continue

        item_data.update(content)
        annotations.append(item_data)

    markdown_text = _annotations_to_markdown(annotations)
    # Explicit UTF-8: the OCR languages include Russian, so the output is not ASCII.
    md_path = out_dir / input_pdf.with_suffix('.md').name
    md_path.write_text(markdown_text, encoding='utf-8')


def _annotations_to_markdown(annotations) -> str:
    """Render the annotation dicts built by ``convert_pdf`` as Markdown text."""
    markdown_lines = []
    for entry in annotations:
        entry_type = entry['type']
        if entry_type == 'h':
            header_level = entry.get('hash_count', 2)
            if header_level:
                prefix = '#' * header_level  # Markdown header prefix
                markdown_lines.append(f"{prefix} {entry.get('text', '')}")
            else:
                markdown_lines.append(entry.get('text', ''))
        elif entry_type == 'p':
            markdown_lines.append(entry.get('text', ''))
        elif entry_type == 'li':
            # Drop the preceding blank separator so consecutive items form one list.
            if markdown_lines and not markdown_lines[-1]:
                markdown_lines.pop()
            markdown_lines.append(f"- {entry.get('text', '')}")
        elif entry_type == 'img':
            bbox = " ".join(str(c) for c in entry['bbox'])
            tag = f'<img data-bbox="{bbox}"></img>'
            print(tag)
            markdown_lines.append(tag)
        elif entry_type == 'table':
            markdown_lines.append(f"![Table]({entry.get('src', '')})")
            # Include the table HTML directly after the image reference.
            markdown_lines.append(entry.get('html', ''))
        # Blank line after each item for Markdown formatting.
        markdown_lines.append('')

    return '\n'.join(markdown_lines)


In [None]:
from tqdm import tqdm

output_path = Path("docling_ppl")
# Materialise and sort the file list so tqdm can show a total and the order is stable.
# convert_pdf writes its output to disk and returns nothing, so no result is inspected.
for pdf_path in tqdm(sorted(Path('pdfs').glob('*.pdf'))):
    convert_pdf(
        pdf_path,
        output_path / pdf_path.stem,
        device="CUDA"
    )


In [None]:
!zip -r -qq docling_ppl.zip docling_ppl

# Docling VLM

In [None]:
from pathlib import Path

from docling.datamodel import vlm_model_specs
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline


def convert_pdf_vlm(input_pdf: str, out_dir: str, device: str = "CUDA"):
    """Convert one PDF to Markdown with Docling's VLM pipeline.

    Uses the ``GRANITEDOCLING_VLLM`` model spec and writes the exported
    Markdown to ``<out_dir>/<pdf stem>.md``.

    Args:
        input_pdf: Path to the source PDF.
        out_dir: Output directory; created if missing.
        device: Accepted for signature parity with ``convert_pdf``; it is not
            applied to the pipeline in this function.

    Returns:
        None. The result is written to disk.
    """
    input_pdf = Path(input_pdf)
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    pipeline_options = VlmPipelineOptions()
    pipeline_options.vlm_options = vlm_model_specs.GRANITEDOCLING_VLLM
    # NOTE(review): a new converter is built on every call; confirm whether the
    # model is reloaded each time when converting many files.
    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=VlmPipeline,
                pipeline_options=pipeline_options,
            )
        },
    )

    converter.initialize_pipeline(InputFormat.PDF)

    doc = converter.convert(input_pdf).document

    # Explicit UTF-8 so the output does not depend on the platform's default encoding.
    md_path = out_dir / input_pdf.with_suffix('.md').name
    md_path.write_text(doc.export_to_markdown(), encoding='utf-8')


In [None]:
from tqdm import tqdm

output_path = Path("docling_vlm")
# Materialise and sort the file list so tqdm can show a total and the order is stable.
for pdf_path in tqdm(sorted(Path('pdfs').glob('*.pdf'))):
    convert_pdf_vlm(
        pdf_path,
        output_path / pdf_path.stem,
        device="CUDA"
    )


In [None]:
!zip -r -qq docling_vlm.zip docling_vlm

# PP Structure V3

In [None]:
!pip install paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/

In [None]:
# The following command installs the PaddlePaddle version for CUDA 12.6. For other CUDA versions and the CPU version, please refer to https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html
!pip install paddlepaddle-gpu==3.2.1 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
!pip install -U "paddleocr[doc-parser]"
# For Linux systems, please directly copy and execute the following commands without modifying the cuda version in the link:
!pip install https://paddle-whl.bj.bcebos.com/nightly/cu126/safetensors/safetensors-0.6.2.dev0-cp38-abi3-linux_x86_64.whl

In [None]:
!pip install https://paddle-whl.bj.bcebos.com/nightly/cu126/safetensors/safetensors-0.6.2.dev0-cp38-abi3-linux_x86_64.whl

In [None]:
# !pip install -U torch
!pip install --force-reinstall torch torchvision torchaudio

In [None]:
import os
from tqdm import tqdm

for i, doc in tqdm(enumerate(os.listdir('pdfs'))):
    stem = doc.replace('.pdf', '')
    print('='*80 + '\n', i, stem, '\n' + '='*70)
    !paddleocr pp_structurev3 -i "./pdfs/{doc}" --save_path "./pp_sv3/{stem}" --device gpu --text_recognition_model_name eslav_PP-OCRv5_mobile_rec


In [None]:
!zip -r -qq pp_sv3.zip pp_sv3

# Monkey-OCR

In [None]:
!git clone https://github.com/Yuliang-Liu/MonkeyOCR.git
%cd MonkeyOCR
!pip install -e .

In [None]:
!python tools/download_model.py -n MonkeyOCR-pro-3B

In [None]:
!pip install "langchain<1"
!pip install lmdeploy==0.9.2

In [None]:
!python parse.py ../pdfs -o ./monkey_ocr

In [None]:
!zip -r -qq monkey_ocr.zip monkey_ocr

# Utils

In [None]:
import os
import re
import tempfile
from pathlib import Path
from typing import List, Optional

import torch
from PIL import Image
from tqdm import tqdm

from pypdfium2 import PdfDocument


def convert_pdf_to_images(pdf_path: Path, scale: float = 2.0) -> List[Image.Image]:
    """Render every page of a PDF to a PIL image.

    Args:
        pdf_path: Location of the PDF file.
        scale: Render scale passed to ``page.render``.

    Returns:
        One PIL image per page, in page order; an empty list when
        ``pdf_path`` does not exist.
    """
    if not pdf_path.exists():
        return []

    pdf_document = PdfDocument(pdf_path.as_posix())
    pages: List[Image.Image] = []
    try:
        for page_index in range(len(pdf_document)):
            page = pdf_document[page_index]
            try:
                pages.append(page.render(scale=scale).to_pil())
            finally:
                # Release the page handle even if rendering fails.
                page.close()
    finally:
        # Release the document handle even if a page fails to render.
        pdf_document.close()
    return pages


def _device() -> str:
    return "cuda" if torch.cuda.is_available() else "cpu"


def _ensure_bf16_supported_or_fallback(dtype_preferred=torch.bfloat16):
    # BF16 is ideal on Ampere+; fallback to FP16 on older GPUs or CPU.
    if _device() == "cpu":
        return torch.float32
    # Most CUDA GPUs can do fp16; bf16 depends.
    try:
        _ = torch.tensor([1.0], device="cuda", dtype=torch.bfloat16)
        return torch.bfloat16
    except Exception:
        return torch.float16


def _clean_chat_prefixes(text: str) -> str:
    # Many chat templates return things like "OCR: ..." or role prefixes.
    return re.sub(r"^\s*(assistant|OCR|Table Recognition|Chart Recognition|Formula Recognition)\s*:?\s*",
                  "", text, flags=re.IGNORECASE).strip()


def _join_pages_as_md(page_texts: List[str]) -> str:
    # Simple join with page separators (safe default).
    out = []
    for i, t in enumerate(page_texts, start=1):
        t = t.strip()
        if not t:
            continue
        out.append(f"\n\n---\n\n<!-- Page {i} -->\n\n{t}")
    return "".join(out).lstrip()


def _save_pil_to_temp_png(img: Image.Image, tmpdir: Path, page_index: int) -> Path:
    """Save ``img`` as ``page_<index>.png`` inside ``tmpdir`` and return the file path.

    The index is zero-padded to five digits.
    """
    target = tmpdir / "page_{:05d}.png".format(page_index)
    img.save(target, format="PNG")
    return target


# Nougat

In [None]:
!pip install nougat-ocr


In [None]:
!pip install albumentations==1.0.0

In [None]:
!pip install "transformers>=4.25.1,<=4.38.2" "pypdfium2<5" "timm==0.5.4" "lightning>=2.0.0,<2022" "sconf>=0.2.3"  "pypdf>=3.1.0",

In [None]:
!nougat pdfs -m 0.1.0-base -o nougat_base


In [None]:
!zip -r -qq nougat_base.zip nougat_base

In [None]:
from pathlib import Path
from typing import List
from tqdm import tqdm
import torch

from transformers import pipeline

# Device argument for the pipeline: 0 when CUDA is available, otherwise -1.
device = 0 if torch.cuda.is_available() else -1

# Image-to-text pipeline for facebook/nougat-base, created once at cell run
# time and reused by pdf_to_md_nougat for every page.
nougat_pipeline = pipeline(
    "image-to-text",
    model="facebook/nougat-base",
    device=device,
    max_new_tokens=4000,
    trust_remote_code=True,
)

def pdf_to_md_nougat(
    pdf_path: Path,
    scale: float = 2.0,
) -> str:
    """
    Transcribe a PDF with the module-level ``nougat_pipeline``.

    Every page is rendered at ``scale``, converted to RGB and passed to the
    pipeline; the stripped per-page texts are merged with
    ``_join_pages_as_md``. Returns an empty string when no pages are rendered.
    """
    rendered_pages = convert_pdf_to_images(pdf_path, scale=scale)
    if not rendered_pages:
        return ""

    def _transcribe(page_image) -> str:
        # An empty pipeline result is treated as an empty page.
        result = nougat_pipeline(page_image.convert("RGB"))
        if not result:
            return ""
        return result[0].get("generated_text", "").strip()

    transcripts: List[str] = [
        _transcribe(page_image)
        for page_image in tqdm(rendered_pages, desc="Nougat pages")
    ]
    return _join_pages_as_md(transcripts)



In [None]:
output_path = Path("nougat")
output_path.mkdir(parents=True, exist_ok=True)
for pdf_path in Path('pdfs').glob('*.pdf'):
    md_result = pdf_to_md_nougat(
        pdf_path,
    )
    # Explicit UTF-8 so the output does not depend on the platform's default encoding.
    md_path = output_path / pdf_path.with_suffix('.md').name
    md_path.write_text(md_result, encoding='utf-8')


In [None]:
!zip -r -qq nougat.zip nougat

In [None]:
from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText


# Load Nanonets-OCR2-3B once at cell run time, in eval mode; dtype and device
# placement are left to "auto".
model = AutoModelForImageTextToText.from_pretrained(
    "nanonets/Nanonets-OCR2-3B",
    torch_dtype="auto",
    device_map="auto",
).eval()
_ = AutoTokenizer.from_pretrained("nanonets/Nanonets-OCR2-3B")  # result is discarded; pdf_to_md_nanonets_ocr2 only uses `processor`
processor = AutoProcessor.from_pretrained("nanonets/Nanonets-OCR2-3B")


def pdf_to_md_nanonets_ocr2(
    pdf_path: Path,
    scale: float = 1.0,
    max_new_tokens: int = 5000,
) -> str:
    """Transcribe a PDF page by page with the module-level Nanonets-OCR2 model.

    Args:
        pdf_path: Location of the PDF file.
        scale: Render scale used when rasterising pages.
        max_new_tokens: Generation budget per page.

    Returns:
        The per-page outputs merged by ``_join_pages_as_md``; an empty string
        when no pages are rendered.
    """
    pages = convert_pdf_to_images(pdf_path, scale=scale)
    if not pages:
        return ""

    # The checkbox glyphs had been mis-encoded as "?"; restored to the box characters.
    prompt = (
        "Extract the text from the above document as if you were reading it naturally. "
        "Return the tables in html format. Return the equations in LaTeX representation. "
        "If there is an image in the document and image caption is not present, add a small "
        "description of the image inside the <img></img> tag; otherwise, add the image caption "
        "inside <img></img>. Watermarks should be wrapped in brackets. "
        "Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. "
        "Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. "
        "Prefer using ☐ and ☑ for check boxes."
    )

    page_texts: List[str] = []
    for img in tqdm(pages, desc="Nanonets-OCR2 pages"):
        img = img.convert("RGB")
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": img},
                    {"type": "text", "text": prompt},
                ],
            },
        ]
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = processor(text=[text], images=[img], padding=True, return_tensors="pt")
        inputs = inputs.to(model.device)

        # Greedy decoding, no gradient tracking.
        with torch.no_grad():
            output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)

        # Keep only the generated continuation (drop the echoed prompt tokens).
        generated_ids = [
            out[len(inp):] for inp, out in zip(inputs.input_ids, output_ids)
        ]
        out_text = processor.batch_decode(
            generated_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )[0]

        page_texts.append(out_text.strip())

    return _join_pages_as_md(page_texts)


In [None]:
output_path = Path("nanonets")
output_path.mkdir(parents=True, exist_ok=True)
for pdf_path in Path('pdfs').glob('*.pdf'):
    md_result = pdf_to_md_nanonets_ocr2(
        pdf_path,
        scale=1
    )
    # Explicit UTF-8 so the output does not depend on the platform's default encoding.
    md_path = output_path / pdf_path.with_suffix('.md').name
    md_path.write_text(md_result, encoding='utf-8')


In [None]:
!zip -r -qq nanonets.zip nanonets

# Dots.OCR

In [None]:
!git clone https://github.com/rednote-hilab/dots.ocr.git
%cd dots.ocr


In [None]:
!python tools/download_model.py

In [None]:
!pip install transformers==4.51.3 qwen_vl_utils==0.0.11

In [None]:
from transformers import AutoModelForCausalLM, AutoProcessor
from qwen_vl_utils import process_vision_info

# Weight dtype chosen by the helper defined in the Utils cell.
dtype = _ensure_bf16_supported_or_fallback(torch.bfloat16)

# Load the Dots.OCR weights from the local ./weights/DotsOCR directory, in
# eval mode. Rebinds the notebook-level `model` and `processor` names.
model = AutoModelForCausalLM.from_pretrained(
    "./weights/DotsOCR",
    torch_dtype=dtype,
    device_map="auto",
    trust_remote_code=True,
).eval()
processor = AutoProcessor.from_pretrained("./weights/DotsOCR", trust_remote_code=True)


def pdf_to_md_dots_ocr(
    pdf_path: Path,
    scale: float = 1.0,
    max_new_tokens: int = 4000,
    prompt: Optional[str] = None,
) -> str:
    """
    Run the module-level Dots.OCR model over every page of ``pdf_path``.

    Pages are rendered at ``scale``, written to temporary PNG files (the chat
    message references each image by path) and processed one at a time. When
    ``prompt`` is None the built-in layout prompt is used, which asks for a
    single JSON object per page. The stripped per-page outputs are merged with
    ``_join_pages_as_md``; an empty string is returned when no pages are rendered.
    """
    # Only define LOCAL_RANK when the environment does not already provide it.
    os.environ.setdefault("LOCAL_RANK", "0")

    rendered_pages = convert_pdf_to_images(pdf_path, scale=scale)
    if not rendered_pages:
        return ""

    if prompt is None:
        prompt = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.

1. Bbox format: [x1, y1, x2, y2]

2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].

3. Text Extraction & Formatting Rules:
    - Picture: For the 'Picture' category, the text field should be omitted.
    - Formula: Format its text as LaTeX.
    - Table: Format its text as HTML.
    - All Others (Text, Title, etc.): Format their text as Markdown.

4. Constraints:
    - The output text must be the original text from the image, with no translation.
    - All layout elements must be sorted according to human reading order.

5. Final Output: The entire output must be a single JSON object.
"""

    # Inputs are moved to CUDA when it is available, otherwise they stay on the CPU.
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    outputs_per_page: List[str] = []

    with tempfile.TemporaryDirectory() as scratch:
        scratch_dir = Path(scratch)
        progress = tqdm(rendered_pages, desc="Dots.OCR pages")
        for page_no, page_image in enumerate(progress, start=1):
            png_path = _save_pil_to_temp_png(page_image.convert("RGB"), scratch_dir, page_no)

            user_content = [
                {"type": "image", "image": str(png_path)},
                {"type": "text", "text": prompt},
            ]
            messages = [{"role": "user", "content": user_content}]

            chat_text = processor.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
            )
            image_inputs, video_inputs = process_vision_info(messages)
            model_inputs = processor(
                text=[chat_text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            ).to(target_device)

            with torch.no_grad():
                full_sequences = model.generate(**model_inputs, max_new_tokens=max_new_tokens)

            # Drop the echoed prompt tokens so only the generated continuation is decoded.
            continuations = []
            for prompt_ids, sequence in zip(model_inputs.input_ids, full_sequences):
                continuations.append(sequence[len(prompt_ids):])

            decoded = processor.batch_decode(
                continuations,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )
            outputs_per_page.append(decoded[0].strip())

    return _join_pages_as_md(outputs_per_page)



In [None]:
output_path = Path("dots_ocr")
output_path.mkdir(parents=True, exist_ok=True)
for pdf_path in Path('../pdfs').glob('*.pdf'):
    md_result = pdf_to_md_dots_ocr(
        pdf_path,
        scale=1
    )
    # Explicit UTF-8 so the output does not depend on the platform's default encoding.
    md_path = output_path / pdf_path.with_suffix('.md').name
    md_path.write_text(md_result, encoding='utf-8')


# DeepSeek

In [None]:
!pip install transformers==4.46.3 tokenizers==0.20.3 PyMuPDF img2pdf einops easydict addict

In [None]:
import sys
from io import StringIO
from transformers import AutoModel, AutoTokenizer

# Load the DeepSeek-OCR tokenizer and model once at cell run time.
# Rebinds the notebook-level `model` name.
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-OCR", trust_remote_code=True)
model = AutoModel.from_pretrained(
    "deepseek-ai/DeepSeek-OCR",
    trust_remote_code=True,
    use_safetensors=True,
)
# With CUDA: eval mode, moved to the GPU and cast to bfloat16. Without CUDA: eval mode only.
model = model.eval().cuda().to(torch.bfloat16) if torch.cuda.is_available() else model.eval()


def pdf_to_md_deepseek_ocr(
    pdf_path: Path,
    scale: float = 1.0,
    prompt: str = "<image>\n<|grounding|>Convert the document to markdown. ",
    base_size: int = 1024,
    image_size: int = 640,
    crop_mode: bool = True,
    save_results: bool = False,
    test_compress: bool = True,
) -> tuple:
    """Transcribe a PDF page by page with the module-level DeepSeek-OCR model.

    Each page is rendered, saved as a temporary PNG and passed to
    ``model.infer``. Everything the call prints to stdout (plus the printed
    return value) is captured, diagnostic lines are filtered out, and the
    remainder is used as the page text.

    Args:
        pdf_path: Location of the PDF file.
        scale: Render scale used when rasterising pages.
        prompt, base_size, image_size, crop_mode, save_results, test_compress:
            Forwarded unchanged to ``model.infer``.

    Returns:
        A ``(markdown, last_result)`` tuple: the per-page texts merged by
        ``_join_pages_as_md`` and the value returned by ``model.infer`` for the
        last page. ``("", None)`` when no pages are rendered.
    """
    # Local import keeps this cell self-contained.
    from contextlib import redirect_stdout

    pages = convert_pdf_to_images(pdf_path, scale=scale)
    if not pages:
        # Keep the tuple shape so callers can always unpack two values.
        return "", None

    # Substrings that mark diagnostic output lines to drop from the captured text.
    noise_markers = ('image:', 'other:', 'PATCHES', '====', 'BASE:', '%|', 'torch.Size')

    page_texts: List[str] = []
    res = None

    with tempfile.TemporaryDirectory() as td:
        tmpdir = Path(td)
        outdir = tmpdir / "deepseek_outputs"
        outdir.mkdir(parents=True, exist_ok=True)

        for i, img in enumerate(tqdm(pages, desc="DeepSeek-OCR pages"), start=1):
            img_path = _save_pil_to_temp_png(img.convert("RGB"), tmpdir, i)
            page_outdir = outdir / f"page_{i:05d}"
            page_outdir.mkdir(parents=True, exist_ok=True)

            # redirect_stdout restores sys.stdout even if inference raises.
            captured = StringIO()
            with redirect_stdout(captured):
                res = model.infer(
                    tokenizer,
                    prompt=prompt,
                    image_file=str(img_path),
                    output_path=str(page_outdir),
                    base_size=base_size,
                    image_size=image_size,
                    crop_mode=crop_mode,
                    save_results=save_results,
                    test_compress=test_compress,
                )
                # The return value is printed into the captured stream on purpose.
                print(res)

            result = '\n'.join(
                line for line in captured.getvalue().split('\n')
                if not any(marker in line for marker in noise_markers)
            ).strip()

            page_texts.append(result)

    return _join_pages_as_md(page_texts), res


In [None]:
output_path = Path("deepseek_ocr")
output_path.mkdir(parents=True, exist_ok=True)
for pdf_path in Path('./pdfs').glob('*.pdf'):
    md_result, last_res = pdf_to_md_deepseek_ocr(
        pdf_path,
        scale=1
    )
    # Explicit UTF-8 so the output does not depend on the platform's default encoding.
    md_path = output_path / pdf_path.with_suffix('.md').name
    md_path.write_text(md_result, encoding='utf-8')


In [None]:
!zip -r -qq deepseek_ocr.zip deepseek_ocr

# Dolphin

In [None]:
!git clone https://github.com/ByteDance/Dolphin.git
%cd Dolphin
!pip install -r requirements.txt


In [None]:
!huggingface-cli download ByteDance/Dolphin-v2 --local-dir ./hf_model


In [None]:
ls ./hf_model

In [None]:
!python demo_page.py --model_path ./hf_model --save_dir ./dolphin_results --input_path ../pdfs


In [None]:
!zip -r -qq dolphin_results.zip dolphin_results