# Document Ingestion with MinerU

This notebook explores document ingestion using the MinerU library for advanced PDF parsing and content extraction.

In [None]:
# Install dependencies if needed (run once, then restart the kernel).
# Prefer %pip over !pip so the package lands in the environment of the running kernel;
# the extras spec is quoted so shells such as zsh do not treat the brackets as a glob.
# %pip install "mineru[all]"
# Or using uv: !uv pip install -U "mineru[all]"

In [None]:
# parse_doc is the simple entry point and do_parse the lower-level one used further down.
# NOTE(review): confirm the installed MinerU release exposes both names at the package
# root. Presumably do_parse lives in mineru.cli.common and parse_doc is only shipped in
# the project's demo script — verify, and adjust this import if it raises ImportError.
from mineru import parse_doc, do_parse
from pathlib import Path
import os
import json

In [None]:
# Configuration: the document to ingest and the folder that receives the results.
pdf_path = "../docs/DocLayNet.pdf"
output_dir = "mineru_output"

# Report the input up front so a wrong relative path is visible before parsing starts.
pdf_found = os.path.exists(pdf_path)
print(f"Processing: {pdf_path}")
print(f"File exists: {pdf_found}")

# exist_ok=True keeps this cell safe to re-run.
os.makedirs(output_dir, exist_ok=True)
print(f"Output directory: {output_dir}")

In [None]:
# Parse the document with parse_doc (simple API).
# output_dir is passed as the destination; the call is made for its side effects only.
doc_paths = [Path(pdf_path)]
parse_doc(
    path_list=doc_paths,
    output_dir=output_dir,
    lang="en",  # OCR language
    backend="hybrid-auto-engine",  # High accuracy backend
    method="auto",  # Auto-detect parsing method
    start_page_id=0,
    # Page ids are 0-based and end_page_id is inclusive (the do_parse cell in this
    # notebook uses end_page_id=2 for "first 3 pages"), so 4 limits the demo to the
    # first 5 pages. The previous value of 5 would have parsed 6 pages.
    end_page_id=4,
)

print("Document parsing completed!")

In [None]:
# List the generated output files.
# MinerU writes its results into nested per-document subfolders rather than directly
# into output_dir, so walk the tree instead of listing only the top level.
# Entries stay plain strings relative to output_dir, which keeps
# os.path.join(output_dir, entry) valid for the cells that follow.
output_root = Path(output_dir)
output_files = sorted(
    str(path.relative_to(output_root))
    for path in output_root.rglob("*")
    if path.is_file()
)
print("Generated output files:")
for file in output_files:
    print(f"  - {file}")

In [None]:
# Read the main Markdown output.
# The Markdown file sits in a nested per-document subfolder of output_dir, so search
# the whole tree; a top-level listing would never find it. Sorting makes the pick
# deterministic when several Markdown files exist.
markdown_matches = sorted(
    path for path in Path(output_dir).rglob("*.md") if path.is_file()
)
# md_file is kept as a path relative to output_dir (or None when nothing was produced).
md_file = str(markdown_matches[0].relative_to(output_dir)) if markdown_matches else None

if md_file:
    md_path = os.path.join(output_dir, md_file)
    print(f"Reading Markdown file: {md_file}")
    with open(md_path, 'r', encoding='utf-8') as f:
        markdown_content = f.read()

    # Only a preview is printed; the full text stays available in markdown_content.
    print("First 1000 characters of Markdown content:")
    print("=" * 50)
    print(markdown_content[:1000])
    print("=" * 50)
else:
    print("No Markdown file found!")

In [None]:
# Read the content list JSON (simplified structured data).
# Like the Markdown file, the content list is written to a nested subfolder of
# output_dir, so look for it recursively.
content_list_matches = sorted(
    path
    for path in Path(output_dir).rglob("*_content_list.json")
    if path.is_file()
)
# content_list_file is a path relative to output_dir, or None when nothing was found.
content_list_file = (
    str(content_list_matches[0].relative_to(output_dir))
    if content_list_matches
    else None
)
# Default to an empty list so later cells that iterate over content_data do not fail
# with a NameError when no content list was produced.
content_data = []

if content_list_file:
    json_path = os.path.join(output_dir, content_list_file)
    print(f"Reading content list: {content_list_file}")
    with open(json_path, 'r', encoding='utf-8') as f:
        content_data = json.load(f)

    print(f"Content list contains {len(content_data)} items")
    print("First few items:")
    for i, item in enumerate(content_data[:5]):
        print(f"{i+1}. Type: {item.get('type', 'unknown')}")
        # Entries without a 'text' field (e.g. images) show an empty preview.
        print(f"   Text: {item.get('text', '')[:100]}...")
        print()
else:
    print("No content list JSON found!")

## MinerU Output Formats

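MinerU generates several output files. They are written to a per-document subfolder of the output directory rather than to the output directory itself: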

- **{filename}.md**: Main Markdown file with extracted text, images, tables, and equations
- **{filename}_content_list.json**: Simplified structured content list
- **{filename}_middle.json**: Detailed intermediate parsing results
- **{filename}_layout.pdf**: Visualization of detected layout blocks
- **{filename}_span.pdf**: Text span visualization (pipeline backend)
- **images/**: Directory containing extracted images
- **tables/**: Directory containing extracted tables

### Content Types in MinerU

MinerU extracts different types of content:

- **text**: Regular text paragraphs
- **title**: Document titles and headings
- **image**: Images with captions
- **table**: Tables in HTML format
- **equation**: Mathematical equations in LaTeX
- **header**: Page headers
- **footer**: Page footers
- **figure**: Figures and diagrams

In [None]:
# Summarise which content types occur in the content list and preview each of them.
if content_list_file:
    content_types = {}
    previews_by_type = {}
    for entry in content_data:
        # Entries without a 'type' key are tallied under 'unknown' ...
        label = entry.get('type', 'unknown')
        content_types[label] = content_types.get(label, 0) + 1
        # ... while previews are grouped by the raw 'type' value, so untyped entries
        # are counted but never shown as samples.
        bucket = previews_by_type.setdefault(entry.get('type'), [])
        if len(bucket) < 2:
            bucket.append(entry)

    print("Content type distribution:")
    for label, total in content_types.items():
        print(f"  {label}: {total}")

    # At most two samples per type, in document order.
    print("\nSample content by type:")
    for label in content_types:
        picked = previews_by_type.get(label, [])
        if not picked:
            continue
        print(f"\n{label.upper()}:")
        for entry in picked:
            snippet = entry.get('text', '')
            suffix = '...' if len(snippet) > 200 else ''
            print(f"  {snippet[:200]}{suffix}")

In [None]:
# Extract and display images.
# The images/ folder is created next to the Markdown file inside a nested subfolder of
# output_dir, so search for it recursively and fall back to the top-level location.
image_dirs = sorted(
    path for path in Path(output_dir).rglob("images") if path.is_dir()
)
images_dir = str(image_dirs[0]) if image_dirs else os.path.join(output_dir, "images")
if os.path.exists(images_dir):
    image_files = sorted(os.listdir(images_dir))
    print(f"Extracted {len(image_files)} images:")
    for img_file in image_files[:5]:  # Show first 5
        print(f"  - {img_file}")

    # content_data is only defined when a content list was loaded, so guard on
    # content_list_file instead of assuming it exists.
    image_items = (
        [item for item in content_data if item.get('type') == 'image']
        if content_list_file
        else []
    )
    if image_items:
        print(f"\nImage metadata (first {min(3, len(image_items))}):")
        for i, img_item in enumerate(image_items[:3]):
            # Per the MinerU output-file docs, image entries carry 'img_path' and a
            # caption list rather than 'text'. The caption key is presumably
            # 'image_caption' (older releases: 'img_caption') — verify for your version.
            caption = img_item.get('image_caption') or img_item.get('img_caption') or []
            if isinstance(caption, list):
                caption = ' '.join(str(part) for part in caption)
            print(f"Image {i+1}:")
            print(f"  Text: {img_item.get('text', '')}")
            print(f"  File: {img_item.get('img_path', 'N/A')}")
            print(f"  Caption: {caption}")
            print(f"  Position: {img_item.get('bbox', 'N/A')}")
else:
    print("No images directory found")

In [None]:
# Extract and display tables.
# Look for a tables/ folder anywhere below output_dir (outputs are nested per document);
# fall back to the top-level location when none is found.
table_dirs = sorted(
    path for path in Path(output_dir).rglob("tables") if path.is_dir()
)
tables_dir = str(table_dirs[0]) if table_dirs else os.path.join(output_dir, "tables")
if os.path.exists(tables_dir):
    table_files = sorted(os.listdir(tables_dir))
    print(f"Extracted {len(table_files)} tables:")
    for table_file in table_files[:3]:  # Show first 3
        print(f"  - {table_file}")
else:
    print("No tables directory found")

# The table preview comes from the content list, so it must not depend on a tables/
# folder existing. content_data is only defined when a content list was loaded.
table_items = (
    [item for item in content_data if item.get('type') == 'table']
    if content_list_file
    else []
)
if table_items:
    print(f"\nTable content (first {min(2, len(table_items))}):")
    for i, table_item in enumerate(table_items[:2]):
        print(f"Table {i+1}:")
        # Per the MinerU output-file docs the table HTML is stored under 'table_body';
        # 'text' is kept as a fallback in case a release uses that key instead.
        html_content = table_item.get('table_body') or table_item.get('text', '')
        print(f"  HTML length: {len(html_content)} characters")
        print(f"  Preview: {html_content[:300]}{'...' if len(html_content) > 300 else ''}")

## Advanced Usage: do_parse API

For more control, you can use the `do_parse` function directly with PDF bytes.

In [None]:
# Advanced usage with do_parse: the caller supplies the raw PDF bytes, the names used
# for the output files, and one language per document (the three lists are parallel).
# Path.read_bytes() opens and closes the file itself; the previous open(...).read()
# left the file handle to be closed by the garbage collector.
pdf_bytes_list = [Path(pdf_path).read_bytes()]
file_names = [Path(pdf_path).stem]
lang_list = ["en"]

# Parse with do_parse for more control.
# NOTE(review): this reuses output_dir and the same file name as the parse_doc cell, so
# it presumably overwrites those results — use a separate directory to keep both runs.
do_parse(
    output_dir=output_dir,
    pdf_file_names=file_names,
    pdf_bytes_list=pdf_bytes_list,
    p_lang_list=lang_list,
    backend="hybrid-auto-engine",
    parse_method="auto",
    formula_enable=True,
    table_enable=True,
    f_dump_md=True,
    f_dump_content_list=True,
    f_dump_middle_json=True,
    start_page_id=0,
    end_page_id=2  # Only first 3 pages
)

print("Advanced parsing with do_parse completed!")

In [None]:
# Compare outputs.
# Walk the whole output tree: MinerU nests its results in per-document subfolders, so
# a top-level listing would show only folder names and hide the generated files.
print("Files after advanced parsing:")
output_root = Path(output_dir)
updated_files = sorted(
    str(path.relative_to(output_root))
    for path in output_root.rglob("*")
    if path.is_file()
)
for file in updated_files:
    print(f"  - {file}")

## Summary

MinerU provides comprehensive document parsing capabilities:

- **High Accuracy**: Uses advanced ML models for layout analysis
- **Multimodal Output**: Extracts text, images, tables, and equations
- **Structured Data**: Provides both Markdown and JSON outputs
- **OCR Support**: Handles scanned documents
- **Flexible APIs**: Both simple and advanced parsing options

Key advantages over basic PDF parsers:
- Better handling of complex layouts
- Equation extraction in LaTeX format
- Table structure preservation
- Image extraction with context
- Reading order preservation