<a href="https://colab.research.google.com/github/rahiakela/general-utility-notebooks/blob/main/multipage_pdf_to_image_conversion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

**Reference**:

https://mtyurt.net/post/2019/multipage-pdf-to-jpeg-image-in-python.html

In [None]:
%%shell

pip install pillow
pip install pdf2image

In [None]:
# System packages. pdf2image shells out to the poppler command-line tools, which
# poppler-utils provides. `-y` answers the confirmation prompt, since a notebook
# cell has no interactive stdin to answer it. `apt-get` is used throughout
# because plain `apt` warns that its CLI is not stable for scripted use.
!sudo apt-get install -y build-essential libpoppler-cpp-dev pkg-config python3-dev
!sudo apt-get install -y tesseract-ocr
!sudo apt-get install -y poppler-utils

In [4]:
import os
import tempfile
from pdf2image import convert_from_path
from PIL import Image

## PDF to Image

In [8]:
def convert_pdf(file_path, output_path):
    """Render every page of a PDF and stack the pages into one tall image.

    Pages are placed top to bottom in document order, left-aligned on a canvas
    as wide as the widest page and as tall as the sum of all page heights.
    Any canvas area not covered by a narrower page keeps the default fill of
    ``Image.new`` (black).

    Args:
        file_path: Path of the PDF to convert.
        output_path: Destination of the merged image. Pillow picks the output
            format from the file extension.

    Returns:
        ``output_path``, unchanged, for convenient chaining.

    Raises:
        ValueError: If no page could be rendered from ``file_path``.

    Note:
        Very long documents can produce a canvas taller than some formats
        allow (JPEG in particular has a maximum dimension) -- verify for
        large inputs.
    """
    # pdf2image writes its intermediate page files into this directory; it is
    # removed automatically when the block exits.
    with tempfile.TemporaryDirectory() as temp_dir:
        pages = convert_from_path(file_path, output_folder=temp_dir)
        if not pages:
            raise ValueError(f"no pages could be rendered from {file_path!r}")

        try:
            canvas_width = max(page.width for page in pages)
            canvas_height = sum(page.height for page in pages)
            merged_image = Image.new(pages[0].mode, (canvas_width, canvas_height))

            # Paste while temp_dir still exists: the page images may be lazily
            # backed by files inside it, so their pixel data has to be read
            # before the directory is cleaned up.
            y_offset = 0
            for page in pages:
                merged_image.paste(page, (0, y_offset))
                y_offset += page.height
        finally:
            # Release the file handles so the temporary directory can be
            # deleted cleanly on every platform.
            for page in pages:
                page.close()

    # The merged canvas lives fully in memory, so it can be written after the
    # temporary files are gone.
    merged_image.save(output_path)

    return output_path

In [6]:
# `-p` makes the cell idempotent: re-running it (e.g. Restart & Run All) does
# not report an error when the directory already exists.
!mkdir -p img_output

In [9]:
# Convert the sample PDF into a single merged image inside img_output/.
# Assumes 8_California.pdf is present in the working directory -- upload it first.
output_path = convert_pdf("8_California.pdf", "img_output/8_California.jpg")