In [None]:
# default_exp core

In [None]:
%load_ext autoreload
%autoreload 2

# pdfsplitter

> A simple way to extract and parse images for machine learning workflows.

In [None]:
#hide
from nbdev.showdoc import *

Our base functionality is fairly simple. It must be able to do the following:

- open a PDF file
- iterate through the pages of the file
- for each page, save that page as an image file whose name ends with the page number

In [None]:
#export
import fitz
from pathlib import Path
import pathlib
import os
from typing import Any

In [None]:
#export
def pdf_to_img(pdf_path: "os.PathLike[Any]", destination_path: "os.PathLike[Any]", img_type: str, export_quality_factor=2.0) -> None:
    """Converts a PDF file into a series of image files.

    Each page is rendered to its own image named
    ``<pdf file stem>-<page number>.<img_type>`` (page numbers start at 1)
    inside ``destination_path``.

    Args:
        pdf_path: Location of the PDF file to convert (str or path-like).
        destination_path: Directory that receives the images. It is created,
            including missing parents, if it does not already exist.
        img_type: File extension of the exported images, e.g. ``"png"``.
        export_quality_factor: Scale factor applied to both axes when
            rendering each page.
    """
    # Normalise to Path so plain strings work as well as Path objects.
    pdf_path = Path(pdf_path)
    destination_path = Path(destination_path)
    destination_path.mkdir(parents=True, exist_ok=True)

    mat = fitz.Matrix(export_quality_factor, export_quality_factor)
    pdf_obj = fitz.open(str(pdf_path))
    try:
        for page_number in range(len(pdf_obj)):
            page = pdf_obj.load_page(page_number)
            pix = page.get_pixmap(matrix=mat)  # render with the scaling matrix, not the identity
            # .stem drops the suffix whatever its length (the old [:-4] slice assumed ".pdf").
            target = destination_path / f"{pdf_path.stem}-{page_number + 1}.{img_type}"
            pix.save(str(target))
    finally:
        # Always release the underlying file handle, even if rendering fails.
        pdf_obj.close()

In [None]:
#export
def extract_images_from_pdfs(source_folder: "os.PathLike[Any]", destination_folder: "os.PathLike[Any]", img_type: str, export_quality_factor=2.0):
    """Converts all PDF files inside a particular source folder into individual image files.

    Each PDF file exports a single image for each page.
    You can specify the type of image you want. See
    https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-convert-images for
    a full list of supported export options.

    Args:
        source_folder: Directory searched (non-recursively) for ``*.pdf`` files.
        destination_folder: Directory that receives the exported images.
        img_type: File extension of the exported images, e.g. ``"png"``.
        export_quality_factor: Scale factor forwarded to ``pdf_to_img``.
    """
    # Normalise to Path so plain strings work as well as Path objects.
    source_folder = Path(source_folder)
    destination_folder = Path(destination_folder)
    # Sort for a deterministic processing order (glob order is filesystem dependent).
    for file in sorted(source_folder.glob("*.pdf")):
        # Forward the caller's choices instead of a hard-coded "png" and default quality.
        pdf_to_img(file, destination_folder, img_type, export_quality_factor=export_quality_factor)

In [None]:
#export
def say_name(name="alex"):
    """Print ``name`` to standard output (defaults to ``"alex"``)."""
    print(name)

In [None]:
# Demo run: convert every PDF found in ./tryout/ into PNG page images
# stored in the "processed" sub-folder of that same directory.
source = Path("./tryout/")
destination = source / "processed"
extract_images_from_pdfs(
    source_folder=source,
    destination_folder=destination,
    img_type="png",
)

In [None]:
# TRY SOME TESTING HERE
# assert say_hello("Jeremy")=="Hello Jeremy!"