# GLM-OCR Invoice Processing

This notebook runs OCR on invoice images and PDFs using GLM-OCR (0.9B) with GPU acceleration on Kaggle.
It uses the Hugging Face Transformers API with the `zai-org/GLM-OCR` model.

In [None]:
# Cell 1: Install dependencies
# Each line starting with "!" is run as a shell command by the notebook, not
# as Python. --quiet reduces pip's console output.
# GLM-OCR requires transformers from GitHub (not PyPI release)
# pypdfium2 is needed for PDF-to-image conversion (GLM-OCR works on images only)
# NOTE(review): none of these installs is version-pinned, so the resolved
# versions (in particular transformers from the main branch) can differ
# between runs.
!pip install torch torchvision --quiet
!pip install git+https://github.com/huggingface/transformers.git --quiet
# NOTE(review): accelerate is presumably what enables device_map="auto" when
# the model is loaded in Cell 3 - confirm before removing it.
!pip install accelerate sentencepiece pypdfium2 --quiet

In [None]:
# Cell 2: Verify GPU
# Report the PyTorch build and, when CUDA is usable, the name and total
# memory of device 0.
import torch

has_cuda = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {has_cuda}")
if has_cuda:
    device_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; convert to GiB for display.
    print(f"VRAM: {device_props.total_memory / 1024**3:.1f} GB")

In [None]:
# Cell 3: Setup and initialize GLM-OCR
import os
import json
import time
from pathlib import Path
from transformers import AutoProcessor, AutoModelForImageTextToText

# Configuration: model identifier plus input/output locations.
MODEL_NAME = "zai-org/GLM-OCR"
INPUT_DIR = Path("/kaggle/input/synthetic-invoices")
OUTPUT_DIR = Path("/kaggle/working/glm-ocr")

# Allocator setting intended to reduce VRAM fragmentation on the T4 GPU.
os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"})

# Create the output folder and its "raw" subfolder (parent first).
for out_dir in (OUTPUT_DIR, OUTPUT_DIR / "raw"):
    out_dir.mkdir(exist_ok=True)

# Initialize GLM-OCR: the processor builds model inputs, the model generates
# the recognized text.
print("Loading GLM-OCR model...")
processor = AutoProcessor.from_pretrained(MODEL_NAME)
load_options = {"torch_dtype": "auto", "device_map": "auto"}
model = AutoModelForImageTextToText.from_pretrained(MODEL_NAME, **load_options)
print(f"GLM-OCR loaded on {model.device}")

In [None]:
# Cell 4: Define processing function
import pypdfium2 as pdfium
import tempfile

def pdf_to_image_path(pdf_path, scale=1):
    """Render the first page of a PDF to a temporary PNG file.

    Args:
        pdf_path: Location of the PDF; it is converted with ``str()`` before
            being handed to pypdfium2.
        scale: Scale factor forwarded to ``page.render``. Defaults to 1, the
            value that was previously hard-coded, so existing callers get
            the same image as before. Larger values produce more pixels.

    Returns:
        The path (str) of the PNG that was written. The file is created with
        ``delete=False``, so the caller is responsible for removing it.

    Raises:
        Any exception raised while opening, rendering or saving propagates
        unchanged. If saving fails, the partially created temp file is
        removed first.
    """
    pdf = pdfium.PdfDocument(str(pdf_path))
    try:
        page = pdf[0]
        try:
            # Keep `bitmap` referenced until the PNG is written: the PIL
            # image may share its pixel buffer with the bitmap.
            bitmap = page.render(scale=scale)
            pil_image = bitmap.to_pil()

            # Reserve a unique file name, then close the handle so the image
            # library can reopen the path for writing.
            tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
            tmp_path = tmp.name
            tmp.close()
            try:
                pil_image.save(tmp_path)
            except Exception:
                # Do not leave an orphaned temp file behind on failure.
                if os.path.exists(tmp_path):
                    os.unlink(tmp_path)
                raise
        finally:
            # Release the page before its parent document.
            page.close()
    finally:
        # Close the document explicitly instead of waiting for garbage
        # collection, so handles do not pile up over a large batch.
        pdf.close()
    return tmp_path

def process_image(image_path, model, processor):
    """Run GLM-OCR on a single image or PDF and return a result dict.

    A path ending in ".pdf" (case-insensitive) is first converted to a
    temporary PNG via ``pdf_to_image_path``; any other path is passed to the
    processor as-is.

    Args:
        image_path: Path (str or path-like) of the input file.
        model: Loaded generation model; must provide ``device`` and
            ``generate``.
        processor: Matching processor; must provide ``apply_chat_template``
            and ``decode``.

    Returns:
        A dict with these keys:
            success: True when OCR completed, False when an exception was
                caught.
            raw_text: Decoded, stripped model output ("" on failure).
            raw_data: ``[{"text": raw_text}]`` when text was produced,
                otherwise ``[]``.
            processing_time_seconds: Wall-clock duration rounded to 3
                decimals (on both the success and the failure path).
            error: Only on failure - the exception message followed by the
                formatted traceback.

    This function does not raise for ordinary errors: every ``Exception`` is
    converted into a failure dict so a batch run can continue.
    """
    start_time = time.time()
    temp_file = None
    # Pre-declare the tensor holders so the cleanup code can drop them no
    # matter where a failure happened.
    inputs = None
    generated_ids = None

    try:
        # Convert PDF to image if needed
        if str(image_path).lower().endswith(".pdf"):
            temp_file = pdf_to_image_path(image_path)
            img_url = temp_file
        else:
            img_url = str(image_path)

        # Build chat message with image and OCR prompt
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "url": img_url},
                    {"type": "text", "text": "Text Recognition:"}
                ],
            }
        ]

        inputs = processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt"
        ).to(model.device)

        # Drop token_type_ids if the processor produced them, so they are
        # not forwarded to generate().
        inputs.pop("token_type_ids", None)

        with torch.no_grad():
            generated_ids = model.generate(**inputs, max_new_tokens=8192)

        # Decode only the newly generated tokens: skip the prompt portion,
        # whose length is the input sequence length.
        prompt_length = inputs["input_ids"].shape[1]
        output_text = processor.decode(
            generated_ids[0][prompt_length:],
            skip_special_tokens=True
        ).strip()

        processing_time = time.time() - start_time

        return {
            "success": True,
            "raw_text": output_text,
            "raw_data": [{"text": output_text}] if output_text else [],
            "processing_time_seconds": round(processing_time, 3)
        }
    except Exception as e:
        import traceback
        return {
            "success": False,
            "error": f"{str(e)}\n{traceback.format_exc()}",
            "raw_text": "",
            "raw_data": [],
            # Rounded like the success path so both report the same precision.
            "processing_time_seconds": round(time.time() - start_time, 3)
        }
    finally:
        # Drop the tensor references first; while they are still bound, the
        # memory they occupy cannot be released by empty_cache().
        inputs = None
        generated_ids = None
        # Free GPU memory between images to prevent OOM on PDFs
        torch.cuda.empty_cache()
        # Clean up temp PDF-converted image. Best effort: a failed delete
        # must not replace the OCR result with an exception.
        if temp_file:
            try:
                os.unlink(temp_file)
            except OSError:
                pass

In [None]:
# Cell 5: Test on a single image before batch processing
# Gather every PNG, JPEG and PDF directly inside INPUT_DIR, sorted by path.
image_files = sorted(
    found
    for pattern in ("*.png", "*.jpeg", "*.pdf")
    for found in INPUT_DIR.glob(pattern)
)
print(f"Found {len(image_files)} files total")

if not image_files:
    print("ERROR: No images found in input directory!")
else:
    # Smoke-test the pipeline on the first file before the full batch run.
    test_img = image_files[0]
    print(f"\nTesting on: {test_img.name}")

    test_result = process_image(test_img, model, processor)
    print(f"Success: {test_result['success']}")
    print(f"Processing time: {test_result['processing_time_seconds']:.2f}s")

    if not test_result["raw_text"]:
        print("\nWARNING: No text extracted!")
        if test_result.get("error"):
            print(f"Error: {test_result['error']}")
    else:
        # Show at most the first 500 characters of the recognized text.
        preview = test_result["raw_text"][:500]
        print(f"Text preview ({len(test_result['raw_text'])} chars):\n{preview}")
        print("\nOCR pipeline is working correctly!")

In [None]:
# Cell 6: Process all images
# Runs OCR on every collected file, writes one summary JSON and one raw JSON
# per file, and accumulates the summaries in `results`.
total_files = len(image_files)
print(f"Processing {total_files} files...")

empty_count = 0
results = []
for position, image_path in enumerate(image_files, start=1):
    print(f"Processing {position}/{total_files}: {image_path.name}", end="")

    result = process_image(image_path, model, processor)
    extracted_text = result.get("raw_text", "")

    # Finish the progress line, flagging files that yielded no text.
    if extracted_text:
        print()
    else:
        empty_count += 1
        print(" - WARNING: no text extracted!")

    # Create output structure
    output = {
        "filename": image_path.name,
        "model_name": "glm-ocr",
        "raw_text": extracted_text,
        "processing_time_seconds": result.get("processing_time_seconds", 0),
        "file_size_bytes": image_path.stat().st_size,
        "success": result.get("success", False)
    }
    if not result["success"]:
        output["error"] = result.get("error", "Unknown error")
    results.append(output)

    # The extension is part of the base name so that e.g. "a.png" and
    # "a.pdf" do not overwrite each other's output.
    base_name = f"{image_path.stem}_{image_path.suffix.lstrip('.')}"

    # Save individual result
    result_file = OUTPUT_DIR / f"{base_name}.json"
    result_file.write_text(json.dumps(output, indent=2))

    # Save raw response
    raw_payload = {
        "content": extracted_text,
        "detections": result.get("raw_data", [])
    }
    raw_file = OUTPUT_DIR / "raw" / f"{base_name}_raw.json"
    raw_file.write_text(json.dumps(raw_payload, indent=2))

print(f"\nCompleted! Processed {len(results)} files")

In [None]:
# Cell 7: Save combined results and summary
# Write every per-file summary into a single JSON file.
all_results_file = OUTPUT_DIR / "all_results.json"
with open(all_results_file, "w") as f:
    json.dump(results, f, indent=2)

# Print summary
successful = sum(1 for r in results if r.get("success", False))
with_text = sum(1 for r in results if r.get("raw_text", ""))
total_time = sum(r.get("processing_time_seconds", 0) for r in results)
# `results` is empty when no input files were found; report an average of 0
# in that case instead of failing with ZeroDivisionError.
average_time = total_time / len(results) if results else 0.0
print(f"\nSummary:")
print(f"  Total files: {len(results)}")
print(f"  Successful: {successful}")
print(f"  With text extracted: {with_text}")
print(f"  Empty results: {empty_count}")
print(f"  Failed: {len(results) - successful}")
print(f"  Total processing time: {total_time:.1f}s")
print(f"  Average time per file: {average_time:.2f}s")
print(f"\nResults saved to: {OUTPUT_DIR}")

In [None]:
# Cell 8: Create downloadable ZIP
import shutil

# Bundle everything under OUTPUT_DIR into /kaggle/working/glm_ocr.zip
# (make_archive appends the ".zip" extension to base_name itself).
shutil.make_archive(
    base_name="/kaggle/working/glm_ocr",
    format="zip",
    root_dir=OUTPUT_DIR,
)
print("ZIP created: /kaggle/working/glm_ocr.zip")
print("Download from the Output tab after notebook completes.")