# Nemotron-Parse Invoice Processing

> **WARNING: This notebook requires 24+ GB VRAM and will NOT run on Kaggle's free GPU tier (P100 16GB).**
>
> The model's ViT-H vision encoder requires ~10.5 GB for attention computation at the fixed 2048x1664 input resolution. This cannot be reduced without retraining the model. See `docs/nemotron_parse.md` for full details on workarounds attempted.
>
> **Recommended GPUs:** NVIDIA A100 (40GB+), H100 (80GB), A10 (24GB), or RTX 4090 (24GB)

This notebook processes invoice images and PDFs (first page only) using NVIDIA Nemotron-Parse v1.1 (885M) with GPU acceleration.
It uses the Hugging Face Transformers API with the `nvidia/NVIDIA-Nemotron-Parse-v1.1` model.

In [None]:
# Cell 1: Install dependencies
# - transformers is pinned to 4.51.3: Nemotron-Parse has issues with 4.53+
# - accelerate / albumentations / timm are pinned to fixed versions
#   (NOTE(review): presumably required by the model's remote code — confirm)
# - open_clip_torch is required by the C-RADIO vision encoder
# - pypdfium2 is needed for PDF-to-image conversion (model works on images only)
# - Pillow is used to open the images passed to the processor
# NOTE(review): open_clip_torch, pypdfium2 and Pillow are unpinned, so the
# installed versions can differ between runs — consider pinning them too.
!pip install transformers==4.51.3 --quiet
!pip install accelerate==1.12.0 albumentations==2.0.8 timm==1.0.22 --quiet
!pip install open_clip_torch --quiet
!pip install pypdfium2 Pillow --quiet

In [None]:
# Cell 2: Report the PyTorch build and, when CUDA is usable, the first GPU's
# name and total memory in GiB.
import torch

print(f"PyTorch version: {torch.__version__}")

cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")

if cuda_ok:
    gpu_name = torch.cuda.get_device_name(0)
    total_vram_gib = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"GPU: {gpu_name}")
    print(f"VRAM: {total_vram_gib:.1f} GB")

In [None]:
# Cell 3: Setup and initialize Nemotron-Parse
import os
import json
import time
from pathlib import Path
from transformers import AutoModel, AutoProcessor, AutoTokenizer, GenerationConfig

# Ask the CUDA caching allocator to use expandable segments, which reduces
# VRAM fragmentation.
# NOTE(review): torch has already been imported (Cell 2) by the time this is
# set — confirm the allocator still picks it up; if not, move this assignment
# ahead of the first `import torch`.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Configuration
INPUT_DIR = Path("/kaggle/input/synthetic-invoices")
OUTPUT_DIR = Path("/kaggle/working/nemotron-parse")
# parents=True keeps the cell working when the parent directory does not
# exist yet (e.g. when the notebook is run outside Kaggle).
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
(OUTPUT_DIR / "raw").mkdir(parents=True, exist_ok=True)

MODEL_NAME = "nvidia/NVIDIA-Nemotron-Parse-v1.1"
DEVICE = "cuda:0"

# Initialize Nemotron-Parse
# - trust_remote_code=True: required for custom model architecture
# - BF16 precision: optimal for this model
# - .eval(): inference only, no training-mode layers
print("Loading Nemotron-Parse model...")
model = AutoModel.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
).to(DEVICE).eval()

# The tokenizer is loaded here but not used by the cells below; it is kept so
# that any other code relying on the `tokenizer` name keeps working.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)
generation_config = GenerationConfig.from_pretrained(MODEL_NAME, trust_remote_code=True)

print(f"Nemotron-Parse loaded on {DEVICE}")

In [None]:
# Cell 4: Define processing function
import pypdfium2 as pdfium
import tempfile
from PIL import Image

def pdf_to_image_path(pdf_path):
    """Render the first page of a PDF to a temporary PNG file.

    Args:
        pdf_path: Path (str or path-like) of the PDF to convert.

    Returns:
        The filesystem path (str) of the PNG that was written. The file is
        created with ``delete=False``, so the caller is responsible for
        removing it.

    Raises:
        Whatever pypdfium2 or Pillow raise for an unreadable PDF, a PDF
        without pages, or a failed save. In every case the PDF document is
        closed, and a temp file that could not be written is removed.
    """
    pdf = pdfium.PdfDocument(str(pdf_path))
    try:
        page = pdf[0]
        bitmap = page.render(scale=2)  # Higher scale for better OCR quality
        pil_image = bitmap.to_pil()

        # Reserve a unique file name; the handle is closed right away so the
        # image can be written to the path afterwards.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            tmp_path = tmp.name

        try:
            # Save while `bitmap` is still alive and the document still open.
            pil_image.save(tmp_path)
        except Exception:
            # Don't leave an empty/partial temp file behind on failure.
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)
            raise

        return tmp_path
    finally:
        # Release the PDF document explicitly instead of waiting for
        # garbage collection.
        pdf.close()

def process_image(image_path, model, processor, generation_config):
    """Run Nemotron-Parse on a single image or PDF and report the outcome.

    A path ending in ``.pdf`` (case-insensitive) is first converted to a
    temporary PNG via ``pdf_to_image_path``; the temp file is removed before
    returning. Any exception is caught and reported in the returned dict
    rather than raised.

    Args:
        image_path: Path (str or path-like) of the image or PDF.
        model: Loaded model exposing ``generate``.
        processor: Processor used to build the model inputs and to decode
            the generated token ids.
        generation_config: Generation settings passed to ``model.generate``.

    Returns:
        dict with keys:
            ``success`` (bool),
            ``raw_text`` (str, empty on failure),
            ``raw_data`` (list; ``[{"text": raw_text}]`` when text was
            produced, otherwise empty),
            ``processing_time_seconds`` (float, rounded to 3 decimals),
            and, on failure only, ``error`` (message plus traceback).
    """
    start_time = time.time()
    temp_file = None

    try:
        # Convert PDF to image if needed
        if str(image_path).lower().endswith(".pdf"):
            temp_file = pdf_to_image_path(image_path)
            source_path = temp_file
        else:
            source_path = str(image_path)

        # Task prompt for markdown output with bboxes and classes
        task_prompt = "</s><s><predict_bbox><predict_classes><output_markdown>"

        # Image.open keeps the underlying file open; the context manager
        # closes it as soon as the processor has built its tensors, so the
        # handle is not leaked and the temp file is closed before deletion.
        with Image.open(source_path) as image:
            inputs = processor(
                images=[image],
                text=task_prompt,
                return_tensors="pt",
                add_special_tokens=False,
            ).to(DEVICE)

        # Generate text
        with torch.no_grad():
            outputs = model.generate(**inputs, generation_config=generation_config)

        # Decode the generated text
        generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]

        return {
            "success": True,
            "raw_text": generated_text,
            "raw_data": [{"text": generated_text}] if generated_text else [],
            "processing_time_seconds": round(time.time() - start_time, 3),
        }
    except Exception as e:
        # Deliberate catch-all: one bad file must not abort a batch run.
        import traceback
        return {
            "success": False,
            "error": f"{str(e)}\n{traceback.format_exc()}",
            "raw_text": "",
            "raw_data": [],
            # Rounded like the success path so both report the same precision.
            "processing_time_seconds": round(time.time() - start_time, 3),
        }
    finally:
        # Free GPU memory between images to prevent OOM
        torch.cuda.empty_cache()
        # Clean up temp PDF-converted image
        if temp_file and os.path.exists(temp_file):
            os.unlink(temp_file)

In [None]:
# Cell 5: Smoke-test the pipeline on the first input file before the batch run.
# Inputs are every *.png, *.jpeg and *.pdf directly inside INPUT_DIR, sorted.
image_files = sorted(
    path
    for pattern in ("*.png", "*.jpeg", "*.pdf")
    for path in INPUT_DIR.glob(pattern)
)
print(f"Found {len(image_files)} files total")

if not image_files:
    print("ERROR: No images found in input directory!")
else:
    test_img = image_files[0]
    print(f"\nTesting on: {test_img.name}")

    test_result = process_image(test_img, model, processor, generation_config)
    print(f"Success: {test_result['success']}")
    print(f"Processing time: {test_result['processing_time_seconds']:.2f}s")

    if not test_result["raw_text"]:
        print("\nWARNING: No text extracted!")
        # Surface the captured error, if the failure path recorded one
        if test_result.get("error"):
            print(f"Error: {test_result['error']}")
    else:
        # Show only the first 500 characters, alongside the full length
        preview = test_result["raw_text"][:500]
        print(f"Text preview ({len(test_result['raw_text'])} chars):\n{preview}")
        print("\nOCR pipeline is working correctly!")

In [None]:
# Cell 6: Run every input file through the model, writing one result JSON and
# one raw JSON per file and collecting the per-file records in `results`.
total_files = len(image_files)
print(f"Processing {total_files} files...")

empty_count = 0
results = []
for position, image_path in enumerate(image_files, start=1):
    print(f"Processing {position}/{total_files}: {image_path.name}", end="")

    result = process_image(image_path, model, processor, generation_config)

    # Finish the progress line, flagging files that yielded no text
    # (failed files have empty text, so they are counted here as well).
    if result.get("raw_text"):
        print()
    else:
        empty_count += 1
        print(" - WARNING: no text extracted!")

    # Per-file record
    output = {
        "filename": image_path.name,
        "model_name": "nemotron-parse",
        "raw_text": result.get("raw_text", ""),
        "processing_time_seconds": result.get("processing_time_seconds", 0),
        "file_size_bytes": image_path.stat().st_size,
        "success": result.get("success", False),
    }
    if not result["success"]:
        output["error"] = result.get("error", "Unknown error")
    results.append(output)

    # Output names combine stem and extension, so inputs that share a stem
    # but differ in extension get distinct files.
    base_name = f"{image_path.stem}_{image_path.suffix.lstrip('.')}"

    # Individual result
    result_file = OUTPUT_DIR / f"{base_name}.json"
    with open(result_file, "w") as f:
        json.dump(output, f, indent=2)

    # Raw response
    raw_payload = {
        "content": result.get("raw_text", ""),
        "detections": result.get("raw_data", []),
    }
    raw_file = OUTPUT_DIR / "raw" / f"{base_name}_raw.json"
    with open(raw_file, "w") as f:
        json.dump(raw_payload, f, indent=2)

print(f"\nCompleted! Processed {len(results)} files")

In [None]:
# Cell 7: Save combined results and print a summary of the batch run
all_results_file = OUTPUT_DIR / "all_results.json"
with open(all_results_file, "w") as f:
    json.dump(results, f, indent=2)

# Aggregate counts and timings over the per-file records
successful = sum(1 for r in results if r.get("success", False))
with_text = sum(1 for r in results if r.get("raw_text", ""))
total_time = sum(r.get("processing_time_seconds", 0) for r in results)
# Guard against an empty batch (no input files found), which would otherwise
# raise ZeroDivisionError.
average_time = total_time / len(results) if results else 0.0

print("\nSummary:")
print(f"  Total files: {len(results)}")
print(f"  Successful: {successful}")
print(f"  With text extracted: {with_text}")
print(f"  Empty results: {empty_count}")
print(f"  Failed: {len(results) - successful}")
print(f"  Total processing time: {total_time:.1f}s")
print(f"  Average time per file: {average_time:.2f}s")
print(f"\nResults saved to: {OUTPUT_DIR}")

In [None]:
# Cell 8: Bundle everything under OUTPUT_DIR into a ZIP for download
import shutil

# make_archive appends the ".zip" extension to base_name itself
shutil.make_archive(
    base_name="/kaggle/working/nemotron_parse",
    format="zip",
    root_dir=OUTPUT_DIR,
)

print("ZIP created: /kaggle/working/nemotron_parse.zip")
print("Download from the Output tab after notebook completes.")