In [1]:
import numpy as np
# Restore the ``np.int`` / ``np.float`` aliases when the installed NumPy no
# longer provides them (they were removed in NumPy 1.24). Presumably one of the
# OCR dependencies imported below still references them -- TODO confirm.
for _alias, _builtin in (("int", int), ("float", float)):
    if not hasattr(np, _alias):
        setattr(np, _alias, _builtin)

import os
import fitz
from paddleocr import PaddleOCR
import json
from uuid import uuid4
from PIL import Image

In [None]:
# Folder that is scanned for input PDF files.
PDF_FOLDER = "data/pdfs"
# Folder that receives one rendered PNG per PDF page.
IMAGE_OUTPUT_FOLDER = "data/images"
# File the collected OCR results are written to as JSON.
JSON_OUTPUT_FILE = "data/ocr_output.json"

# Create the image folder up front; exist_ok=True makes re-running this cell safe.
os.makedirs(IMAGE_OUTPUT_FOLDER, exist_ok=True)


[2025/09/02 18:00:53] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\Rukshan/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\Rukshan/.paddleocr/whl\\rec\\en\\en_PP-OCRv3_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25

In [5]:
def pdf_to_images(pdf_path, output_folder, dpi=300):
    """Render every page of a PDF to a PNG file and return the image paths.

    Args:
        pdf_path: Path of the PDF file to open.
        output_folder: Directory the PNG files are written to. It is not
            created here.
        dpi: Resolution passed to ``page.get_pixmap``. Defaults to 300.

    Returns:
        List of the written image paths in page order. Files are named
        ``<pdf file stem>_page<N>.png`` with ``N`` starting at 1.
    """
    # The file stem is the same for every page, so compute it once.
    pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
    image_paths = []

    # The context manager closes the document even when rendering a page
    # raises; without it every processed PDF keeps its file handle open.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            image_path = os.path.join(output_folder, f"{pdf_stem}_page{page_num}.png")
            page.get_pixmap(dpi=dpi).save(image_path)
            image_paths.append(image_path)

    return image_paths

In [8]:
# Render every PDF in PDF_FOLDER to page images; other directory entries are
# ignored. Only the image paths of the last converted PDF remain in
# `image_paths` afterwards.
# NOTE(review): the OCR cell further down runs the same conversion again.
for pdf_file in os.listdir(PDF_FOLDER):
    if pdf_file.lower().endswith(".pdf"):
        pdf_path = os.path.join(PDF_FOLDER, pdf_file)
        image_paths = pdf_to_images(pdf_path, IMAGE_OUTPUT_FOLDER)

In [None]:
# Shared OCR engine for the cells below: English language, with the text-angle
# classifier switched off (use_angle_cls=False).
ocr = PaddleOCR(lang='en', use_angle_cls=False)

In [None]:

def ocr_image(image_path):
    """Run PaddleOCR on an image and return extracted text + bounding boxes.

    Args:
        image_path: Image handed to the module-level ``ocr`` engine.

    Returns:
        List of dicts, one per recognised text entry, with the keys
        ``"bbox"``, ``"text"`` and ``"confidence"`` (converted to ``float``).
        The list is empty when nothing was detected.
    """
    result = ocr.ocr(image_path, cls=False)
    ocr_data = []

    # PaddleOCR reports an image without any detected text as None (the
    # result is then ``[None]`` or ``None``); iterating over it unguarded
    # raises TypeError, so empty entries are skipped.
    for line in result or []:
        if not line:
            continue
        for bbox, (text, confidence) in line:
            ocr_data.append({
                "bbox": bbox,
                "text": text,
                "confidence": float(confidence)
            })

    return ocr_data

In [4]:
# Convert every PDF in PDF_FOLDER to page images, OCR each image and write all
# results to JSON_OUTPUT_FILE.
all_ocr_results = []

# os.listdir returns entries in arbitrary order; sorting keeps the order of
# the records in the JSON file stable between runs and machines.
for pdf_file in sorted(os.listdir(PDF_FOLDER)):
    if not pdf_file.lower().endswith(".pdf"):
        continue
    pdf_path = os.path.join(PDF_FOLDER, pdf_file)

    # One PNG per page, then one result record per PNG.
    image_paths = pdf_to_images(pdf_path, IMAGE_OUTPUT_FOLDER)
    all_ocr_results.extend(
        {"image_path": img_path, "ocr_data": ocr_image(img_path)}
        for img_path in image_paths
    )

# Make sure the output directory exists instead of relying on the image
# folder happening to share the same parent.
os.makedirs(os.path.dirname(JSON_OUTPUT_FILE) or ".", exist_ok=True)
with open(JSON_OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_ocr_results, f, indent=4, ensure_ascii=False)

print(f"OCR extraction completed! JSON saved to {JSON_OUTPUT_FILE}")

[2025/09/02 18:01:03] ppocr DEBUG: dt_boxes num : 45, elapse : 0.4938628673553467
[2025/09/02 18:01:05] ppocr DEBUG: rec_res num  : 45, elapse : 1.516120433807373
OCR extraction completed! JSON saved to data/ocr_output.json
