# PaddleOCR Invoice Processing

This notebook extracts text from invoice files (PNG and JPEG images, plus PDFs) using PaddleOCR with GPU acceleration on Kaggle.
It uses the classic, stable PaddleOCR API (`ocr.ocr()`) for reliable text extraction.

In [None]:
# Cell 1: Install dependencies
# Use PaddlePaddle GPU with CUDA 11.8 (matches Kaggle T4 GPU environment)
!pip install paddlepaddle-gpu==3.2.2 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ --quiet
!pip install "paddleocr>=2.6,<3" PyMuPDF --quiet

In [None]:
# Cell 2: Verify the PaddlePaddle build and GPU visibility
import paddle

print(f"PaddlePaddle version: {paddle.__version__}")
# is_compiled_with_cuda() only reports whether this build includes CUDA support;
# it says nothing about whether a GPU is attached to the session.
print(f"Compiled with CUDA: {paddle.device.is_compiled_with_cuda()}")
# The visible device count is what tells us a GPU can actually be used.
gpu_count = paddle.device.cuda.device_count()
print(f"GPU available: {gpu_count > 0}")
print(f"GPU count: {gpu_count}")

In [None]:
# Cell 3: Setup and initialize PaddleOCR
import os
import json
import time
from pathlib import Path
from paddleocr import PaddleOCR

# Configuration
INPUT_DIR = Path("/kaggle/input/synthetic-invoices")
OUTPUT_DIR = Path("/kaggle/working/paddleocr")
# One call creates OUTPUT_DIR and its raw/ subfolder; parents=True also covers
# the case where an intermediate directory does not exist yet, and
# exist_ok=True keeps the cell safe to re-run.
(OUTPUT_DIR / "raw").mkdir(parents=True, exist_ok=True)

# Initialize PaddleOCR using the classic stable API
# - use_angle_cls=True: detect rotated text
# - lang='en': use English-optimized models
# - use_gpu=True: leverage Kaggle T4 GPU
# - show_log=False: reduce noise in output
print("Initializing PaddleOCR...")
ocr = PaddleOCR(
    use_angle_cls=True,
    lang='en',
    use_gpu=True,
    show_log=False
)
print("PaddleOCR initialized!")

In [None]:
# Cell 4: Define processing function
def process_image(image_path, ocr):
    """Run OCR on a single file and collect the recognized text.

    Args:
        image_path: Path (or string) of the file to process. It is handed to
            the engine as ``str(image_path)``.
        ocr: Object exposing ``ocr(path, cls=True)``, such as the ``PaddleOCR``
            instance created earlier in the notebook. The call is expected to
            return a list of pages, each page being ``None``/empty or a list
            of ``[bbox, (text, confidence)]`` entries.

    Returns:
        dict: On success, ``success`` (True), ``raw_text`` (detected lines
        joined with newlines), ``raw_data`` (one dict per detection with
        ``text``, ``confidence`` and ``bbox``) and ``processing_time_seconds``.
        On failure, ``success`` (False), ``error`` (message plus traceback)
        and ``processing_time_seconds``. The elapsed time is rounded to three
        decimals in both cases.

    Any exception raised while running or parsing the OCR result is caught
    and reported through the failure dict rather than propagated.
    """
    import traceback

    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()

    try:
        # Run OCR using the classic ocr.ocr() method (stable, well-tested)
        result = ocr.ocr(str(image_path), cls=True)

        lines = []
        raw_data = []

        # Both the overall result and individual pages may be None or empty
        # when nothing was detected, hence the "or []" fallbacks.
        for page in result or []:
            for line in page or []:
                bbox, (text, confidence) = line
                if not text:  # Skip empty strings
                    continue
                lines.append(text)
                raw_data.append({
                    "text": text,
                    "confidence": float(confidence),
                    # Defensive: turn array-like boxes into plain lists so the
                    # detection can be serialized with json.dump later on.
                    "bbox": bbox.tolist() if hasattr(bbox, "tolist") else bbox
                })

        return {
            "success": True,
            "raw_text": "\n".join(lines),
            "raw_data": raw_data,
            "processing_time_seconds": round(time.perf_counter() - start_time, 3)
        }
    except Exception as e:
        return {
            "success": False,
            "error": f"{str(e)}\n{traceback.format_exc()}",
            # Rounded like the success path so both outcomes report the same precision.
            "processing_time_seconds": round(time.perf_counter() - start_time, 3)
        }

In [None]:
# Cell 5: Test on a single image before batch processing
# This validates that the OCR pipeline is working correctly

# Every supported input pattern. "*.jpg" is listed alongside "*.jpeg" so that
# JPEG files using the shorter extension are not silently skipped.
INPUT_PATTERNS = ("*.png", "*.jpg", "*.jpeg", "*.pdf")
image_files = sorted(
    path for pattern in INPUT_PATTERNS for path in INPUT_DIR.glob(pattern)
)
print(f"Found {len(image_files)} images total")

if image_files:
    test_img = image_files[0]
    print(f"\nTesting on: {test_img.name}")

    # Run OCR and print raw result for debugging
    test_result = ocr.ocr(str(test_img), cls=True)
    print(f"Result type: {type(test_result)}")
    print(f"Number of pages: {len(test_result) if test_result else 0}")

    if test_result and test_result[0]:
        print(f"Detections on page 1: {len(test_result[0])}")
        # Show first 3 detections as a sample
        for i, line in enumerate(test_result[0][:3]):
            bbox, (text, confidence) = line
            print(f"  [{i}] text='{text}', confidence={confidence:.4f}")
        print("  ...")
        print("\nOCR pipeline is working correctly!")
    else:
        print("\nWARNING: No text detected! Check GPU/model initialization.")
        print(f"Raw result: {test_result}")
else:
    print("ERROR: No images found in input directory!")

In [None]:
# Cell 6: Process all images
# image_files was already populated in the test cell above
print(f"Processing {len(image_files)} images...")

# empty_count tracks OCR runs that succeeded but produced no text. Failures are
# reported separately so they are not also counted as "empty" in the summary.
empty_count = 0
results = []
for i, image_path in enumerate(image_files):
    print(f"Processing {i+1}/{len(image_files)}: {image_path.name}", end="")

    result = process_image(image_path, ocr)
    succeeded = result.get("success", False)

    if not succeeded:
        # Show only the first line of the error here; the full message
        # (including the traceback) is stored in the per-file JSON below.
        error_lines = str(result.get("error", "")).splitlines()
        error_summary = error_lines[0] if error_lines and error_lines[0] else "Unknown error"
        print(f" - ERROR: {error_summary}", end="")
    elif not result.get("raw_text"):
        empty_count += 1
        print(" - WARNING: no text extracted!", end="")

    print()

    # Create output structure
    output = {
        "filename": image_path.name,
        "model_name": "paddleocr",
        "raw_text": result.get("raw_text", ""),
        "processing_time_seconds": result.get("processing_time_seconds", 0),
        "file_size_bytes": image_path.stat().st_size,
        "success": succeeded
    }

    if not succeeded:
        output["error"] = result.get("error", "Unknown error")

    results.append(output)

    # The original extension is kept in the file name so that e.g. a.png and
    # a.pdf do not overwrite each other's results.
    output_stem = f"{image_path.stem}_{image_path.suffix.lstrip('.')}"

    # Save individual result
    result_file = OUTPUT_DIR / f"{output_stem}.json"
    with open(result_file, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)

    # Save raw response
    raw_file = OUTPUT_DIR / "raw" / f"{output_stem}_raw.json"
    with open(raw_file, "w", encoding="utf-8") as f:
        json.dump({
            "content": result.get("raw_text", ""),
            "detections": result.get("raw_data", [])
        }, f, indent=2)

print(f"\nCompleted! Processed {len(results)} images")

In [None]:
# Cell 7: Save combined results and summary
# Save all results to single file
all_results_file = OUTPUT_DIR / "all_results.json"
with open(all_results_file, "w") as f:
    json.dump(results, f, indent=2)

# Aggregate the per-file records collected by the batch loop
successful = sum(1 for r in results if r.get("success", False))
with_text = sum(1 for r in results if r.get("raw_text", ""))
total_time = sum(r.get("processing_time_seconds", 0) for r in results)
# Guard the average: results is empty when no input files were found, and the
# summary should still print instead of raising ZeroDivisionError.
average_time = total_time / len(results) if results else 0.0

# Print summary
print("\nSummary:")
print(f"  Total images: {len(results)}")
print(f"  Successful: {successful}")
print(f"  With text extracted: {with_text}")
print(f"  Empty results: {empty_count}")
print(f"  Failed: {len(results) - successful}")
print(f"  Total processing time: {total_time:.1f}s")
print(f"  Average time per image: {average_time:.2f}s")
print(f"\nResults saved to: {OUTPUT_DIR}")

In [None]:
# Cell 8: Create downloadable ZIP
import shutil

# Pack everything under OUTPUT_DIR into a single zip archive. make_archive
# appends the ".zip" extension to the base name itself.
archive_base = "/kaggle/working/paddle_ocr"
shutil.make_archive(archive_base, "zip", root_dir=OUTPUT_DIR)

for message in (
    "ZIP created: /kaggle/working/paddle_ocr.zip",
    "Download from the Output tab after notebook completes.",
):
    print(message)