## 기본 import & 경로 설정

In [8]:
import os
import cv2
import numpy as np
from pdf2image import convert_from_path
from paddleocr import PaddleOCR
import pytesseract
from tqdm import tqdm

# Directory that is scanned for source PDFs.
INPUT_DIR = "./ex_pdf"

# One plain-text output directory per OCR engine.
PADDLE_TXT_DIR = "./outputs/paddle_txt"
TESSERACT_TXT_DIR = "./outputs/tesseract_txt"

# Create both output directories up front; an already existing directory is fine.
for _out_dir in (PADDLE_TXT_DIR, TESSERACT_TXT_DIR):
    os.makedirs(_out_dir, exist_ok=True)



## 엔진 초기화

In [None]:
# PaddleOCR engine configured for Korean, with a recognition model loaded
# from a local directory.
_PADDLE_OPTIONS = {
    "use_angle_cls": True,
    "lang": "korean",
    "rec_model_dir": "./models/korean_PP-OCRv3_rec_infer",
}
paddle_ocr = PaddleOCR(**_PADDLE_OPTIONS)

# Tesseract settings: Korean and English recognized together.
# If the Tesseract binary has to be located explicitly on Windows, set e.g.:
#   pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
TESS_LANG = "kor+eng"
# Passed verbatim to Tesseract; "--psm" selects its page segmentation mode.
TESS_CONFIG = "--psm 6"


  paddle_ocr = PaddleOCR(use_angle_cls=True, lang="korean")   # GPU 사용 시 use_gpu=True
Creating model: ('PP-LCNet_x1_0_doc_ori', None)
Using official model (PP-LCNet_x1_0_doc_ori), the model files will be automatically downloaded and saved in C:\Users\user\.paddlex\official_models.
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 6001.87it/s]
Creating model: ('UVDoc', None)
The model(UVDoc) is not supported to run in MKLDNN mode! Using `paddle` instead!
Using official model (UVDoc), the model files will be automatically downloaded and saved in C:\Users\user\.paddlex\official_models.
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 2999.86it/s]
Creating model: ('PP-LCNet_x1_0_textline_ori', None)
Using official model (PP-LCNet_x1_0_textline_ori), the model files will be automatically downloaded and saved in C:\Users\user\.paddlex\official_models.
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 2999.50it/s]
[32mC

## 페이지 단위 OCR 함수

In [12]:
def _paddle_page_texts(page_result) -> list:
    """Return the recognized strings held in one page's PaddleOCR result.

    Two result layouts are supported:

    * PaddleOCR 3.x: a dict-like result object whose ``"rec_texts"`` entry
      is the list of recognized strings.
    * PaddleOCR 2.x: a list of ``[box, (text, confidence)]`` entries.

    An empty or ``None`` page result yields an empty list.
    """
    if not page_result:
        return []
    if hasattr(page_result, "keys"):
        # Dict-like (3.x) result. Iterating it would walk its *keys*, so the
        # texts must be read from the "rec_texts" entry instead.
        return list(page_result.get("rec_texts") or [])
    # Legacy (2.x) layout: entry[1] is the (text, confidence) pair.
    return [entry[1][0] for entry in page_result]


def ocr_paddle(pdf_path: str) -> str:
    """Run PaddleOCR over every page of a PDF and return the recognized text.

    Each page is rasterized at 300 dpi, converted from RGB to the BGR
    channel order and passed to the module-level ``paddle_ocr`` engine.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        All recognized text lines of all pages, joined with newlines. The
        result is an empty string when nothing was recognized.
    """
    pages = convert_from_path(pdf_path, dpi=300)
    lines = []
    for pil_img in pages:
        img_bgr = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
        # NOTE(review): ``ocr()`` is kept because it exists in both 2.x and
        # 3.x; 3.x appears to deprecate it in favour of ``predict()`` —
        # confirm before switching.
        result = paddle_ocr.ocr(img_bgr)
        if result:
            # A single image was passed, so its result is the first element.
            lines.extend(_paddle_page_texts(result[0]))
    return "\n".join(lines)


def ocr_tesseract(pdf_path: str) -> str:
    """Run Tesseract over every page of a PDF and return the recognized text.

    Pages are rasterized at 300 dpi and passed to Tesseract one at a time,
    using the module-level ``TESS_LANG`` and ``TESS_CONFIG`` settings.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The per-page text blocks joined with newlines, in page order.
    """
    page_texts = [
        pytesseract.image_to_string(page, lang=TESS_LANG, config=TESS_CONFIG)
        for page in convert_from_path(pdf_path, dpi=300)
    ]
    return "\n".join(page_texts)



## 전체 PDF 일괄 처리

In [13]:
# Every entry of INPUT_DIR whose name ends in ".pdf" (case-insensitive).
pdf_files = list(
    filter(lambda entry: entry.lower().endswith(".pdf"), os.listdir(INPUT_DIR))
)

for pdf in tqdm(pdf_files, desc="Processing PDFs"):
    path = os.path.join(INPUT_DIR, pdf)
    stem = os.path.splitext(pdf)[0]
    # Both engines write to the same file name, each in its own directory.
    out_name = f"{stem}.txt"

    # PaddleOCR pass: recognize, then write the text as UTF-8.
    paddle_txt = ocr_paddle(path)
    with open(os.path.join(PADDLE_TXT_DIR, out_name), "w", encoding="utf-8") as f:
        f.write(paddle_txt)

    # Tesseract pass: recognize, then write the text as UTF-8.
    tess_txt = ocr_tesseract(path)
    with open(os.path.join(TESSERACT_TXT_DIR, out_name), "w", encoding="utf-8") as f:
        f.write(tess_txt)



  result = paddle_ocr.ocr(img_bgr)
Processing PDFs: 100%|██████████| 3/3 [01:13<00:00, 24.62s/it]
