# Nanonets OCR Testing and Analysis

This notebook tests and analyzes the Nanonets OCR model for policy document processing.
It demonstrates the issues with the model and provides working solutions.


## 1. Setup and Dependencies


In [1]:
import os
import sys
import time
from pathlib import Path
import logging
from PIL import Image
import torch
from pdf2image import convert_from_path
from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText
import gc
import pandas as pd

# Add the scripts directory to the path so we can import from pdf_to_markdown.
# The membership check keeps re-runs of this cell from stacking duplicate
# entries onto sys.path.
SCRIPTS_DIR = '../scripts'
if SCRIPTS_DIR not in sys.path:
    sys.path.append(SCRIPTS_DIR)

# Setup logging: timestamped INFO-level messages on the root logger, plus a
# module-level logger for this notebook.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

print("✅ Dependencies imported successfully")


ModuleNotFoundError: No module named 'pdf2image'

## 2. Environment Configuration


In [None]:
# Apply the process-level settings used for the OCR experiments in one step.
# NOTE(review): torch and transformers are already imported by the setup cell;
# confirm these variables are still honoured when they are set this late.
os.environ.update({
    "TRANSFORMERS_VERBOSITY": "info",
    "TOKENIZERS_PARALLELISM": "false",
    "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:512",
})

print("🔧 Environment configured for OCR testing")

# Report which compute device is available to this kernel.
cuda_ready = torch.cuda.is_available()
print(f"CUDA Available: {cuda_ready}")
if not cuda_ready:
    print("Using CPU for processing")
else:
    print(f"GPU: {torch.cuda.get_device_name()}")
    # total_memory is divided by 1e9, so the figure is in decimal gigabytes.
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")


## 3. Import and Test Existing PDF Processing Methods


In [None]:
# Try to pull in the project's PDF-to-markdown converter; report the outcome
# either way instead of letting an ImportError stop the notebook.
try:
    from pdf_to_markdown import NanonetsOCRConverter
except ImportError as import_error:
    print(f"❌ Failed to import: {import_error}")
    print("We'll define a simplified version for testing")
else:
    print("✅ Successfully imported NanonetsOCRConverter from pdf_to_markdown")


## 4. Analyze Available PDF Documents


In [None]:
# Check available PDF files.
# The matches are sorted so that "the first PDF" is the same file on every
# run and machine; glob results come back in directory-listing order, which
# is not guaranteed to be stable.
policy_dir = Path("../policy_corpus")
pdf_files = sorted(policy_dir.glob("*.pdf")) if policy_dir.exists() else []

print(f"📁 Found {len(pdf_files)} PDF files:")
for pdf in pdf_files:
    size_mb = pdf.stat().st_size / (1024 * 1024)  # bytes -> MiB
    print(f"   - {pdf.name}: {size_mb:.1f} MB")

if not pdf_files:
    # Bind test_pdf even when nothing was found, so a later reference sees an
    # explicit None instead of raising NameError.
    test_pdf = None
    print("⚠️ No PDF files found in policy_corpus directory")
else:
    # Select the first PDF for testing
    test_pdf = pdf_files[0]
    print(f"\n🎯 Using {test_pdf.name} for testing")


## 5. Test PDF to Image Conversion


In [None]:
def test_pdf_to_images(pdf_path: Path, max_pages: int = 2, dpi: int = 150):
    """Test PDF to image conversion with timing."""
    
    print(f"🔄 Converting first {max_pages} pages of {pdf_path.name} at {dpi} DPI...")
    start_time = time.time()
    
    try:
        images = convert_from_path(
            pdf_path,
            dpi=dpi,
            fmt='RGB',
            first_page=1,
            last_page=max_pages
        )
        
        conversion_time = time.time() - start_time
        
        print(f"✅ Converted {len(images)} pages in {conversion_time:.2f} seconds")
        
        # Analyze image properties
        for i, img in enumerate(images, 1):
            print(f"   Page {i}: {img.width}x{img.height} pixels, mode: {img.mode}")
        
        return images
        
    except Exception as e:
        print(f"❌ Failed to convert PDF: {e}")
        return []

# Run the conversion on the selected PDF. test_images is always bound (empty
# when no PDF was found) so that later cells can reference it on a fresh
# kernel without hitting a NameError.
test_images = test_pdf_to_images(test_pdf, max_pages=2) if pdf_files else []
