# 🚀 Fast OCR Processor with GPU Acceleration

**Upload your PDF and get OCR results in ~30 seconds!**

This notebook uses Surya OCR with GPU acceleration for lightning-fast processing.


In [None]:
# Install required packages
!pip install surya-ocr fastapi uvicorn python-multipart -q

print("✅ Packages installed successfully!")
print("🔧 GPU available:", torch.cuda.is_available() if 'torch' in globals() else 'Checking...')


In [None]:
import os
import sys
from datetime import datetime
import json
from pathlib import Path
from typing import Dict, Any

# Import Surya OCR
from surya.ocr import run_ocr
from surya.model.detection import load_model as load_det_model, load_processor as load_det_processor
from surya.model.recognition import load_model as load_rec_model, load_processor as load_rec_processor
from PIL import Image
import fitz  # PyMuPDF

# Both status lines go out in one call; sep keeps them on separate lines.
print(
    "✅ Imports successful!",
    "🚀 Ready for fast OCR processing!",
    sep="\n",
)


In [None]:
# Instantiate the Surya detection and recognition components
# (the models are downloaded on the first run).
print("🔄 Loading Surya OCR models...")
print("⏱️ This may take 1-2 minutes on first run...")

# Detection: processor first, then model.
det_processor = load_det_processor()
det_model = load_det_model()

# Recognition: model first, then processor.
rec_model = load_rec_model()
rec_processor = load_rec_processor()

print("✅ Models loaded successfully!")
print("🎯 Ready for GPU-accelerated OCR!")


In [None]:
from google.colab import files
import io

# Ask the user for a PDF through Colab's upload widget.
print("📄 Upload your PDF file:")
uploaded = files.upload()

if uploaded:
    # Only the first uploaded file is used by the rest of the notebook.
    pdf_filename = next(iter(uploaded))
    print(f"✅ Uploaded: {pdf_filename}")
    print(f"📊 File size: {len(uploaded[pdf_filename])} bytes")
else:
    # Forget any filename left over from an earlier run of this cell, so the
    # processing cell does not silently re-run OCR on a stale file.
    globals().pop('pdf_filename', None)
    print("❌ No file uploaded")


In [None]:
def pdf_to_images(pdf_path, zoom=2.0):
    """Render every page of a PDF to a PIL image.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        zoom: Scale factor applied to both axes when rasterising a page.
            The default of 2.0 renders at 2x scale for better OCR quality.

    Returns:
        A list with one PIL image per page, in page order.

    Raises:
        Whatever PyMuPDF or PIL raise while opening or rendering; the
        document is closed before the exception propagates.
    """
    # The same matrix serves every page, so build it once outside the loop.
    render_matrix = fitz.Matrix(zoom, zoom)
    images = []

    # The context manager closes the document even if a page fails to render.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pix = page.get_pixmap(matrix=render_matrix)
            # Go through in-memory PNG bytes so the PIL image does not
            # depend on the document once it has been closed.
            png_bytes = pix.tobytes("png")
            images.append(Image.open(io.BytesIO(png_bytes)))

    return images

def _extract_page_lines(prediction):
    """Return (texts, confidences) for the non-blank lines of one page prediction."""
    texts = []
    confidences = []
    for line in prediction.text_lines:
        if not line.text.strip():
            continue
        texts.append(line.text)
        # NOTE(review): a line may come back without a confidence score —
        # confirm against the Surya schema. Such lines keep their text but are
        # left out of the average instead of raising on None.
        if line.confidence is not None:
            confidences.append(line.confidence)
    return texts, confidences


def process_ocr_fast(images, languages=None):
    """Run Surya OCR over page images and summarise the recognised text.

    Uses the module-level det_model, det_processor, rec_model and
    rec_processor loaded earlier in the notebook.

    Args:
        images: Sequence of page images, one per page.
        languages: Language codes passed to run_ocr for every page. Accepts
            a sequence of codes or a single code string. Defaults to
            ["en", "ar"] when omitted or None.

    Returns:
        A dict with the keys "extraction_status", "extraction_method",
        "raw_text", "text_length", "confidence" (mean line confidence as a
        percentage rounded to one decimal, 0 when nothing was scored),
        "processing_time" (string form of the elapsed timedelta),
        "pages_processed" and "lines_extracted".
    """
    # A None sentinel avoids a mutable list as the default argument.
    if languages is None:
        page_languages = ["en", "ar"]
    elif isinstance(languages, str):
        page_languages = [languages]
    else:
        page_languages = list(languages)

    page_count = len(images)
    print(f"🚀 Processing {page_count} pages with GPU acceleration...")

    start_time = datetime.now()
    if page_count:
        predictions = run_ocr(
            images,
            # Each page gets its own list rather than N references to one list.
            [list(page_languages) for _ in range(page_count)],
            det_model,
            det_processor,
            rec_model,
            rec_processor,
        )
    else:
        # Nothing to recognise; skip the model call on an empty batch.
        predictions = []
    processing_time = datetime.now() - start_time

    # Collect per-page sections and join once instead of growing a string.
    page_sections = []
    confidences = []
    total_lines = 0
    for page_number, prediction in enumerate(predictions, start=1):
        texts, page_confidences = _extract_page_lines(prediction)
        if not texts:
            # Pages without any non-blank line get no section header.
            continue
        page_text = "".join(text + "\n" for text in texts)
        page_sections.append(f"\n--- Page {page_number} ---\n{page_text}\n")
        confidences.extend(page_confidences)
        total_lines += len(texts)

    raw_text = "".join(page_sections).strip()
    avg_confidence = (sum(confidences) / len(confidences)) * 100 if confidences else 0

    return {
        "extraction_status": "success",
        "extraction_method": "Surya OCR (GPU Accelerated)",
        "raw_text": raw_text,
        "text_length": len(raw_text),
        "confidence": round(avg_confidence, 1),
        "processing_time": str(processing_time),
        "pages_processed": page_count,
        "lines_extracted": total_lines
    }

# Confirm that the helper functions defined in this cell are ready to call.
print("✅ OCR processing functions ready!")


In [None]:
# Rasterise the uploaded PDF, run OCR on the pages and print a summary.
if 'pdf_filename' not in locals():
    print("❌ Please upload a PDF file first")
else:
    print("🔄 Converting PDF to images...")
    images = pdf_to_images(pdf_filename)
    print(f"📷 Converted to {len(images)} images")

    print("\n🚀 Starting GPU-accelerated OCR...")
    result = process_ocr_fast(images)

    # One print call; each argument becomes one output line.
    print(
        "\n✅ OCR COMPLETE!",
        "📊 Results:",
        f"   • Status: {result['extraction_status']}",
        f"   • Method: {result['extraction_method']}",
        f"   • Text Length: {result['text_length']} characters",
        f"   • Confidence: {result['confidence']}%",
        f"   • Processing Time: {result['processing_time']}",
        f"   • Pages: {result['pages_processed']}",
        f"   • Lines: {result['lines_extracted']}",
        sep="\n",
    )

    # The viewing and download cells look the result up under this name.
    ocr_result = result


In [None]:
# 📖 Preview the beginning of the extracted text
if 'ocr_result' not in locals():
    print("❌ No OCR results found. Please run the OCR processing first.")
else:
    print("🔍 EXTRACTED TEXT PREVIEW:", "=" * 50, sep="\n")

    # Only the leading 1000 characters are shown here.
    text_preview = ocr_result['raw_text'][:1000]
    print(text_preview)

    # The slice is shorter than the full text exactly when something was cut off.
    if len(text_preview) < len(ocr_result['raw_text']):
        print(f"\n... (showing first 1000 of {len(ocr_result['raw_text'])} characters)")
        print("\n💡 To see full text, run the cell below")

    print("\n" + "=" * 50)
    print(f"📊 Summary: {ocr_result['text_length']} characters, {ocr_result['confidence']}% confidence")


In [None]:
# 📄 Print the complete extracted text between two rulers
if 'ocr_result' not in locals():
    print("❌ No OCR results found. Please run the OCR processing first.")
else:
    print(
        "📄 FULL EXTRACTED TEXT:",
        "=" * 80,
        ocr_result['raw_text'],
        "=" * 80,
        f"✅ Total: {ocr_result['text_length']} characters extracted",
        sep="\n",
    )


In [None]:
# 💾 Write the OCR result to a JSON file and hand it to the browser
if 'ocr_result' not in locals():
    print("❌ No OCR results found. Please run the OCR processing first.")
else:
    import json
    from datetime import datetime

    # The timestamp in the name tells successive exports apart.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f"tenancy_contract_ocr_{timestamp}.json"

    # Serialise to a string first, then write it out as UTF-8;
    # ensure_ascii=False keeps non-ASCII text readable in the file.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(json.dumps(ocr_result, indent=2, ensure_ascii=False))

    print(f"💾 Results saved to: {filename}")

    # Start the download through Colab's file helper.
    from google.colab import files
    files.download(filename)

    print("✅ File downloaded to your computer!")
    print("🔄 You can now use this data in your Cursor project!")


In [None]:
# Export the OCR result as JSON under a generic file name.
# NOTE(review): this repeats the export done by the previous cell with a
# different file name; running both cells downloads the result twice.
if 'ocr_result' not in locals():
    print("❌ No results to download")
else:
    # Timestamped target file for this export.
    result_filename = "ocr_results_{}.json".format(datetime.now().strftime('%Y%m%d_%H%M%S'))

    # Serialise to a string first, then write it out as UTF-8.
    with open(result_filename, 'w', encoding='utf-8') as f:
        f.write(json.dumps(ocr_result, indent=2, ensure_ascii=False))

    print(f"💾 Results saved to: {result_filename}")

    # Start the download through Colab's file helper.
    from google.colab import files
    files.download(result_filename)

    print("✅ Results downloaded to your computer!")
    print("\n🔄 You can now use this data in your Cursor project!")
