### Different PDF parsers

- `PyMuPDF` is a Python wrapper for the MuPDF library, which is a lightweight PDF and XPS viewer and parser. It can be used to extract text, images, and other data from PDF files, as well as to manipulate PDF files programmatically. It provides a comprehensive set of tools for working with PDF files, including merging and splitting PDFs, adding annotations and bookmarks, and converting PDFs to other formats.

In [None]:
%pip install PyMuPDF
%pip install pdf2image pytesseract pillow PyMuPDF
%brew install tesseract
%which tesseract
%brew install poppler

In [None]:
import os
import pytesseract
from PIL import Image
from pdf2image import convert_from_path

# OCR pipeline: render every page of a PDF to an image, store the pages as
# JPEG files next to the PDF, run Tesseract on each stored page, and write the
# combined text to `ocr_text.txt` in the same directory.

# Point pytesseract at the Tesseract executable (placeholder path -- replace
# with the real location, e.g. the output of `which tesseract`).
pytesseract.pytesseract.tesseract_cmd = r'/somepath/tesseract'

# PDF to process (placeholder path).
pdf_file_path = '/somepath/test.pdf'

# Render the PDF into a list of PIL images via pdf2image.
images = convert_from_path(pdf_file_path)

# Directory for the per-page JPEGs, created beside the PDF.
# exist_ok=True avoids the check-then-create race of os.path.exists().
ocr_image_dir = os.path.join(os.path.dirname(pdf_file_path), 'ocr_images')
os.makedirs(ocr_image_dir, exist_ok=True)

# Save each rendered page as ocr_image_<index>.jpeg and remember its path.
ocr_image_paths = []
for i, img in enumerate(images):
    ocr_image_path = os.path.join(ocr_image_dir, f'ocr_image_{i}.jpeg')
    img.save(ocr_image_path, 'JPEG')
    ocr_image_paths.append(ocr_image_path)

# Run Tesseract on every saved page. The context manager closes each image
# file as soon as its text is extracted instead of leaking the handle.
page_texts = []
for ocr_image_path in ocr_image_paths:
    with Image.open(ocr_image_path) as img:
        text = pytesseract.image_to_string(img)
    page_texts.append(text)

# Each page's text is followed by a newline, in page order.
full_text = ''.join(f'{page_text}\n' for page_text in page_texts)

# Write the result with an explicit encoding: OCR output can contain
# non-ASCII characters, and the platform default encoding may not be UTF-8.
ocr_text_path = os.path.join(os.path.dirname(pdf_file_path), 'ocr_text.txt')
with open(ocr_text_path, 'w', encoding='utf-8') as f:
    f.write(full_text)
