# NuMarkdown-8B-Thinking Invoice Processing

> **WARNING: This notebook requires 16+ GB VRAM and may not run on Kaggle's free GPU tier (P100 16GB).**
>
> The 8B parameter model in BF16 precision needs roughly 16 GB of VRAM for the weights alone (8B parameters × 2 bytes), before activations and the KV cache are counted. If you encounter OOM errors, consider:
> - Using a GPU with more VRAM (A100, H100, RTX 4090)
> - Using a quantized version of the model
>
> **Recommended GPUs:** NVIDIA A100 (40GB+), H100 (80GB), A10 (24GB), or RTX 4090 (24GB)

This notebook processes invoice images using NuMind AI's NuMarkdown-8B-Thinking model.

**Model:** [numind/NuMarkdown-8B-Thinking](https://huggingface.co/numind/NuMarkdown-8B-Thinking)

**Key Features:**
- First reasoning-based OCR VLM
- Generates `<think>` tags to analyze document layout before producing markdown
- Particularly good at complex layouts and tables
- Fine-tuned from Qwen 2.5-VL-7B with RL (GRPO) training

In [None]:
# Cell 1: Install dependencies
# NuMarkdown-8B-Thinking is based on Qwen 2.5-VL and requires recent transformers
# qwen-vl-utils provides utilities for Qwen VL models
# pypdfium2 is needed for PDF-to-image conversion (model works on images only)
!pip install transformers>=4.45.0 accelerate --quiet
!pip install qwen-vl-utils --quiet
!pip install pypdfium2 Pillow --quiet

In [None]:
# Cell 2: Verify GPU
import torch

# Report the PyTorch build and whether CUDA is usable; when it is, also name
# device 0 and show its total memory converted from bytes to GiB.
cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {gpu_props.total_memory / 1024**3:.1f} GB")

In [None]:
# Cell 3: Setup and initialize NuMarkdown-8B-Thinking
import os
import json
import time
from pathlib import Path
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from transformers.utils import is_flash_attn_2_available

# Reduce VRAM fragmentation
# NOTE(review): torch was already imported (and CUDA queried) in Cell 2; confirm the
# allocator still picks this setting up when it is set this late.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Configuration
INPUT_DIR = Path("/kaggle/input/synthetic-invoices")
OUTPUT_DIR = Path("/kaggle/working/numarkdown-8b-thinking")
# parents=True so the notebook also works when the parent directory does not exist yet.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
(OUTPUT_DIR / "raw").mkdir(parents=True, exist_ok=True)

MODEL_NAME = "numind/NuMarkdown-8B-Thinking"
# Kept for reference only: placement is decided by device_map="auto" below.
DEVICE = "cuda:0"

# The install cell does not install the flash-attn package, and requesting
# "flash_attention_2" without it makes from_pretrained fail. Use it when it is
# available and otherwise fall back to PyTorch's built-in scaled-dot-product attention.
ATTN_IMPLEMENTATION = "flash_attention_2" if is_flash_attn_2_available() else "sdpa"

# Initialize NuMarkdown-8B-Thinking
# - Uses Qwen2_5_VLForConditionalGeneration (based on Qwen 2.5-VL-7B)
# - flash_attention_2 when installed, sdpa otherwise
# - BF16 precision for optimal memory/quality tradeoff
# NOTE(review): trust_remote_code=True allows code from the model repository to run
# locally; keep it only if the repository is trusted.
print("Loading NuMarkdown-8B-Thinking model...")

processor = AutoProcessor.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    min_pixels=100*28*28,
    max_pixels=5000*28*28
)

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    attn_implementation=ATTN_IMPLEMENTATION,
    device_map="auto",
    trust_remote_code=True
)

# Report where the model actually landed instead of the DEVICE constant, which
# device_map="auto" does not consult.
print(f"NuMarkdown-8B-Thinking loaded on {model.device}")
print(f"Attention implementation: {ATTN_IMPLEMENTATION}")
print(f"Model dtype: {model.dtype}")

In [None]:
# Cell 4: Define processing function
import pypdfium2 as pdfium
import tempfile
from PIL import Image

def pdf_to_image_path(pdf_path):
    """Render the first page of a PDF to a temporary PNG file.

    Args:
        pdf_path: Path (str or path-like) of the PDF to convert.

    Returns:
        Filesystem path of the PNG that was written. The file is created with
        ``delete=False``, so the caller is responsible for removing it.

    Raises:
        Whatever pypdfium2 or Pillow raise when the PDF cannot be opened or
        rendered, or the PNG cannot be written. The temporary file is removed
        before a save error propagates.
    """
    pdf = pdfium.PdfDocument(str(pdf_path))
    try:
        page = pdf[0]
        try:
            bitmap = page.render(scale=2)  # Higher scale for better OCR quality
            pil_image = bitmap.to_pil()

            # Reserve a unique name, then close the handle so Pillow can write
            # to the path itself.
            tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
            tmp_path = tmp.name
            tmp.close()
            try:
                pil_image.save(tmp_path)
            except Exception:
                # Do not leave an orphaned temp file behind when saving fails.
                if os.path.exists(tmp_path):
                    os.unlink(tmp_path)
                raise
            return tmp_path
        finally:
            # Release the page before its parent document.
            page.close()
    finally:
        pdf.close()

def parse_output(result_text):
    """Split raw model output into its reasoning and answer parts.

    Args:
        result_text: Decoded model output, expected to look like
            ``<think>...</think><answer>...</answer>``.

    Returns:
        A ``(reasoning, answer)`` tuple of strings.

        * ``reasoning`` is the stripped text between ``<think>`` and the
          following ``</think>``, or ``""`` when that pair is not present.
        * ``answer`` is the stripped text between ``<answer>`` and the
          following ``</answer>``. When ``<answer>`` was opened but never
          closed (for example because generation stopped at the token
          limit), everything after ``<answer>`` is used so the literal tag
          does not leak into the answer.
        * When no answer could be extracted that way, ``answer`` falls back
          to the stripped text after the last ``</think>``, or to the full
          unmodified input when there is no ``</think>`` either.
    """
    reasoning = ""
    answer = ""

    # Extract <think>...</think> section
    _, think_open, after_think = result_text.partition("<think>")
    if think_open:
        think_body, think_close, _ = after_think.partition("</think>")
        if think_close:
            reasoning = think_body.strip()

    # Extract <answer>...</answer> section; partition() returns the whole
    # remainder when the closing tag is missing, which covers truncated output.
    _, answer_open, after_answer = result_text.partition("<answer>")
    if answer_open:
        answer = after_answer.partition("</answer>")[0].strip()

    # If no answer was found, use the full text (minus thinking)
    if not answer:
        if "</think>" in result_text:
            answer = result_text.rpartition("</think>")[2].strip()
        else:
            answer = result_text

    return reasoning, answer

def process_image(image_path, model, processor):
    """Run NuMarkdown-8B-Thinking on a single image or PDF.

    PDFs are first rendered to a temporary PNG (first page only), which is
    deleted again before returning.

    Args:
        image_path: Path of the image or PDF to process.
        model: Loaded generation model; must provide ``generate`` and ``device``.
        processor: Matching processor; must provide ``apply_chat_template``,
            ``decode`` and be callable to build the model inputs.

    Returns:
        A dict with the keys ``success``, ``raw_text``, ``reasoning``,
        ``answer``, ``raw_data`` and ``processing_time_seconds`` (rounded to
        milliseconds). On failure ``success`` is False, the text fields are
        empty and an additional ``error`` key holds the message and traceback.
        Exceptions are reported through the dict instead of being raised.
    """
    start_time = time.time()
    temp_file = None

    try:
        # Convert PDF to image if needed
        if str(image_path).lower().endswith(".pdf"):
            temp_file = pdf_to_image_path(image_path)
            source_path = temp_file
        else:
            source_path = str(image_path)

        # convert() returns an independent copy, so the file handle can be
        # closed right away instead of lingering until garbage collection.
        with Image.open(source_path) as source_image:
            image = source_image.convert("RGB")

        # Build chat-format message (NuMarkdown uses Qwen's chat template)
        messages = [{
            "role": "user",
            "content": [
                {"type": "image"},
            ],
        }]

        # Apply chat template
        prompt = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # Process image and text
        model_input = processor(
            text=prompt,
            images=[image],
            return_tensors="pt"
        ).to(model.device)

        # Generate output with reasoning
        with torch.no_grad():
            output_ids = model.generate(
                **model_input,
                temperature=0.7,
                max_new_tokens=5000
            )

        # generate() returns the prompt tokens followed by the new tokens.
        # Decode only the newly generated part so the chat-template scaffolding
        # does not end up in raw_text or in the fallback answer.
        prompt_length = model_input["input_ids"].shape[1]
        generated_text = processor.decode(
            output_ids[0][prompt_length:],
            skip_special_tokens=True
        )

        # Parse reasoning and answer from the output
        reasoning, answer = parse_output(generated_text)

        processing_time = time.time() - start_time

        return {
            "success": True,
            "raw_text": generated_text,
            "reasoning": reasoning,
            "answer": answer,
            "raw_data": [{"text": generated_text, "reasoning": reasoning, "answer": answer}],
            "processing_time_seconds": round(processing_time, 3)
        }
    except Exception as e:
        # Batch-processing boundary: report the failure for this file in the
        # result dict so one bad input does not abort the whole run.
        import traceback
        return {
            "success": False,
            "error": f"{str(e)}\n{traceback.format_exc()}",
            "raw_text": "",
            "reasoning": "",
            "answer": "",
            "raw_data": [],
            # Rounded like the success path so both report the same precision.
            "processing_time_seconds": round(time.time() - start_time, 3)
        }
    finally:
        # Free GPU memory between images to prevent OOM
        torch.cuda.empty_cache()
        # Clean up temp PDF-converted image
        if temp_file and os.path.exists(temp_file):
            os.unlink(temp_file)

In [None]:
# Cell 5: Test on a single image before batch processing
# Gather every supported input file from INPUT_DIR, ordered by path.
# NOTE(review): "*.jpg" is not matched here - confirm the dataset only uses ".jpeg".
image_files = sorted(
    path
    for pattern in ("*.png", "*.jpeg", "*.pdf")
    for path in INPUT_DIR.glob(pattern)
)
print(f"Found {len(image_files)} files total")

if not image_files:
    print("ERROR: No images found in input directory!")
else:
    # Smoke-test the pipeline on the first file only.
    test_img = image_files[0]
    print(f"\nTesting on: {test_img.name}")

    test_result = process_image(test_img, model, processor)
    print(f"Success: {test_result['success']}")
    print(f"Processing time: {test_result['processing_time_seconds']:.2f}s")

    if not test_result["answer"]:
        # Nothing extracted: surface the error text if the call recorded one.
        print("\nWARNING: No answer extracted!")
        if test_result.get("error"):
            print(f"Error: {test_result['error']}")
    else:
        # Show the first 500 characters of the answer and, when present,
        # the first 300 characters of the reasoning trace.
        preview = test_result["answer"][:500]
        print(f"\nAnswer preview ({len(test_result['answer'])} chars):\n{preview}")
        if test_result["reasoning"]:
            print(f"\nReasoning preview ({len(test_result['reasoning'])} chars):\n{test_result['reasoning'][:300]}...")
        print("\nOCR pipeline is working correctly!")

In [None]:
# Cell 6: Process all images
total_files = len(image_files)
print(f"Processing {total_files} files...")

empty_count = 0
results = []
for position, image_path in enumerate(image_files, start=1):
    # Progress line stays open so a warning can be appended to it.
    print(f"Processing {position}/{total_files}: {image_path.name}", end="")

    result = process_image(image_path, model, processor)

    # Count files that produced no answer (this includes failed runs).
    if not result.get("answer"):
        empty_count += 1
        print(" - WARNING: no answer extracted!", end="")

    print()

    # Per-file record that also goes into the combined results list.
    output = {
        "filename": image_path.name,
        "model_name": "numarkdown-8b-thinking",
        "raw_text": result.get("raw_text", ""),
        "reasoning": result.get("reasoning", ""),
        "answer": result.get("answer", ""),
        "processing_time_seconds": result.get("processing_time_seconds", 0),
        "file_size_bytes": image_path.stat().st_size,
        "success": result.get("success", False)
    }
    if not result["success"]:
        output["error"] = result.get("error", "Unknown error")
    results.append(output)

    # Output names carry the original extension ("<stem>_<ext>") so files that
    # differ only by extension do not overwrite each other.
    output_stem = f"{image_path.stem}_{image_path.suffix.lstrip('.')}"

    # Save individual result
    result_file = OUTPUT_DIR / f"{output_stem}.json"
    with open(result_file, "w") as f:
        json.dump(output, f, indent=2)

    # Save raw response
    raw_payload = {
        "content": result.get("raw_text", ""),
        "reasoning": result.get("reasoning", ""),
        "answer": result.get("answer", ""),
        "detections": result.get("raw_data", [])
    }
    raw_file = OUTPUT_DIR / "raw" / f"{output_stem}_raw.json"
    with open(raw_file, "w") as f:
        json.dump(raw_payload, f, indent=2)

print(f"\nCompleted! Processed {len(results)} files")

print(f"\nCompleted! Processed {len(results)} files")

In [None]:
# Cell 7: Save combined results and summary
# Write every per-file record into one JSON file next to the individual results.
all_results_file = OUTPUT_DIR / "all_results.json"
with open(all_results_file, "w") as f:
    json.dump(results, f, indent=2)

# Print summary
successful = sum(1 for r in results if r.get("success", False))
with_answer = sum(1 for r in results if r.get("answer", ""))
with_reasoning = sum(1 for r in results if r.get("reasoning", ""))
total_time = sum(r.get("processing_time_seconds", 0) for r in results)
# An empty input directory yields no results; avoid dividing by zero in that case.
average_time = total_time / len(results) if results else 0.0
print(f"\nSummary:")
print(f"  Total files: {len(results)}")
print(f"  Successful: {successful}")
print(f"  With answer extracted: {with_answer}")
print(f"  With reasoning trace: {with_reasoning}")
print(f"  Empty results: {empty_count}")
print(f"  Failed: {len(results) - successful}")
print(f"  Total processing time: {total_time:.1f}s")
print(f"  Average time per file: {average_time:.2f}s")
print(f"\nResults saved to: {OUTPUT_DIR}")

In [None]:
# Cell 8: Create downloadable ZIP
import shutil

# Pack the contents of OUTPUT_DIR into a single zip archive in the working directory.
shutil.make_archive(
    base_name="/kaggle/working/numarkdown_8b_thinking",
    format="zip",
    root_dir=OUTPUT_DIR,
)
print("ZIP created: /kaggle/working/numarkdown_8b_thinking.zip")
print("Download from the Output tab after notebook completes.")
print("Download from the Output tab after notebook completes.")