# Merging PDFs : Combine multiple PDF files into one

In [1]:
# installing essential libraries
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [5]:
# Merging multiple PDF files

import PyPDF2

def merge_pdfs(pdf_list, output_path):
    """Concatenate several PDF files into one output file.

    Args:
        pdf_list: Paths of the PDFs to merge; their pages are appended in
            the order the paths are listed.
        output_path: Path of the merged PDF that is written.
    """
    merged = PyPDF2.PdfWriter()
    for source_path in pdf_list:
        source = PyPDF2.PdfReader(source_path)
        for page in source.pages:
            merged.add_page(page)

    with open(output_path, 'wb') as target:
        merged.write(target)
    print(f"Merged PDF saved as {output_path}")


# let's use the above function

# Merge the four input PDFs, in the order listed, into a single file
merge_pdfs(['Page+1.pdf','Page+2.pdf','Page+3.pdf', 'Page+4.pdf'], 'merged.pdf')

# the result is written to merged.pdf, which the later cells reuse as their input

Merged PDF saved as merged.pdf


In [6]:
# splitting a PDF file into multiple PDF files (one file per page)

import PyPDF2

def split_pdf(pdf_path, output_dir):
    """Write every page of a PDF to its own single-page file.

    The files are named ``page_<n>.pdf`` (``n`` starts at 1) and are placed
    inside ``output_dir``.

    Args:
        pdf_path: Path of the PDF to split.
        output_dir: Directory that receives the per-page files. It is created
            (including missing parents) when it does not exist yet.
    """
    import os  # local import: the cell defining this function does not import os

    # open(..., 'wb') raises FileNotFoundError when the target directory is missing
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    pdf_reader = PyPDF2.PdfReader(pdf_path)
    for page_number, page in enumerate(pdf_reader.pages, start=1):
        # a fresh writer per page so each output file holds exactly one page
        pdf_writer = PyPDF2.PdfWriter()
        pdf_writer.add_page(page)
        output_path = f"{output_dir}/page_{page_number}.pdf"

        with open(output_path, 'wb') as out:
            pdf_writer.write(out)
        print(f" Saved {output_path}")

# let's use the above function:
# split merged.pdf into one file per page, written under pdf_files/

split_pdf('merged.pdf', 'pdf_files')

 Saved pdf_files/page_1.pdf
 Saved pdf_files/page_2.pdf
 Saved pdf_files/page_3.pdf
 Saved pdf_files/page_4.pdf


In [7]:
# extract text from PDF files
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [9]:
import pdfplumber

def extract_text(pdf_path, output_text_path):
    """Extract the text of every page of a PDF and save it to a text file.

    The text of each page is followed by a newline, in page order.

    Args:
        pdf_path: Path of the PDF to read.
        output_text_path: Path of the text file to write (UTF-8).
    """
    with pdfplumber.open(pdf_path) as pdf:
        # `or ""` keeps a page without extractable text (extract_text()
        # yielding None) from raising TypeError on the concatenation
        page_texts = [(page.extract_text() or "") + "\n" for page in pdf.pages]

    # explicit UTF-8 so non-ASCII text does not depend on the platform's
    # default encoding
    with open(output_text_path, 'w', encoding='utf-8') as f:
        f.write("".join(page_texts))
    print(f"extracted text is saved as {output_text_path}")

# let's use the above function:
# save the text of Page+1.pdf to output.txt

extract_text('Page+1.pdf', 'output.txt')

extracted text is saved asoutput.txt


In [10]:
# extract images from the PDF file
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.9-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.24.9 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.9-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.24.9-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyMuPDFb-1.24.9-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.9 PyMuPDFb-1.24.9


In [11]:
# extract images from the PDF file

import fitz  # PyMuPDF

def extract_images(pdf_path, output_dir):
    """Save every image referenced by the pages of a PDF as its own file.

    Files are named ``image_<page>_<index>.<ext>`` (both numbers start at 1)
    inside ``output_dir``; ``<ext>`` is the extension reported for the image
    by ``extract_image``.

    Args:
        pdf_path: Path of the PDF to scan.
        output_dir: Directory that receives the image files. It is created
            (including missing parents) when it does not exist yet.
    """
    import os  # local import: the cell defining this function does not import os

    # open(..., "wb") raises FileNotFoundError when the target directory is missing
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # the context manager closes the document even when an extraction fails
    with fitz.open(pdf_path) as pdf_document:
        for page_index in range(len(pdf_document)):
            page = pdf_document.load_page(page_index)
            image_list = page.get_images(full=True)

            for image_index, img in enumerate(image_list, start=1):
                xref = img[0]  # first item of each entry is passed on as the image reference
                base_image = pdf_document.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]
                image_filename = f"{output_dir}/image_{page_index + 1}_{image_index}.{image_ext}"

                with open(image_filename, "wb") as image_file:
                    image_file.write(image_bytes)
                print(f"Saved {image_filename}")

# let's use the above function:
# save the images found in Page+1.pdf under folder/

extract_images('Page+1.pdf', 'folder')

Saved folder/image_1_1.jpeg


In [12]:
# creating password protected PDFs (encrypted PDF file).

import PyPDF2

def encrypt_pdf(input_pdf, output_pdf, password):
    """Copy a PDF page by page and write the copy with password protection.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path of the protected PDF to write.
        password: Password handed to ``PdfWriter.encrypt``.
    """
    source = PyPDF2.PdfReader(input_pdf)
    protected = PyPDF2.PdfWriter()
    for page in source.pages:
        protected.add_page(page)

    # encryption is configured on the writer before the file is serialized
    protected.encrypt(password)

    with open(output_pdf, 'wb') as target:
        protected.write(target)
    print(f"Encrypted PDF saved as {output_pdf}")


# let's use the above function:
# write a password-protected copy of merged.pdf to encrypted.pdf
# NOTE: 'pass123' is a demo value; never hard-code a real password in a notebook
# (read it with getpass.getpass() or from an environment variable instead)

encrypt_pdf('merged.pdf', 'encrypted.pdf', 'pass123')


Encrypted PDF saved as encrypted.pdf


In [15]:
# decrypting PDFs : remove password protection from PDF files

def decrypt_pdf(input_pdf, output_pdf, password):
    """Write an unprotected copy of a (possibly password-protected) PDF.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path of the unprotected PDF to write.
        password: Password used to unlock ``input_pdf``; ignored when the
            input is not encrypted.

    Raises:
        ValueError: If the input is encrypted and the password is rejected.
            Nothing is written to ``output_pdf`` in that case.
    """
    pdf_reader = PyPDF2.PdfReader(input_pdf)
    # Only decrypt encrypted input (PyPDF2 3.x rejects decrypt() on an
    # unencrypted file), and check the result instead of ignoring it:
    # it is falsy when the password does not match.
    if pdf_reader.is_encrypted and not pdf_reader.decrypt(password):
        raise ValueError(f"Could not decrypt {input_pdf}: wrong password")

    pdf_writer = PyPDF2.PdfWriter()
    for page in pdf_reader.pages:
        pdf_writer.add_page(page)

    with open(output_pdf, 'wb') as out:
        pdf_writer.write(out)
    print(f"Decrypted PDF saved as {output_pdf}")


# let's use the above function:
# unlock encrypted.pdf with the password it was encrypted with and save the copy as decrypted.pdf

decrypt_pdf('encrypted.pdf', 'decrypted.pdf', 'pass123')

Decrypted PDF saved as decrypted.pdf


In [16]:
# rearranging PDF Pages : reorder or delete pages within a PDF

def rearrange_pages(input_pdf, output_pdf, page_order):
    """Write a new PDF made of the pages selected by ``page_order``.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path of the reordered PDF to write.
        page_order: Iterable of page indices into the input (0 is the first
            page). Pages are written in exactly this order; an index that is
            not listed is left out of the output.
    """
    source = PyPDF2.PdfReader(input_pdf)
    reordered = PyPDF2.PdfWriter()
    for index in page_order:
        selected_page = source.pages[index]
        reordered.add_page(selected_page)

    with open(output_pdf, 'wb') as target:
        reordered.write(target)
    print(f"Rearranged PDF saved as {output_pdf}")


# let's use the above function

rearrange_pages('merged.pdf', 'rearrange.pdf', [2,1,0])

# [2,1,0] are 0-based page indices: the first three pages are written in reverse order.
# Pages that are not listed are dropped, so page 4 of the four-page merged.pdf is not in rearrange.pdf.

Rearranged PDF saved as rearrange.pdf


In [17]:
# rotating pages : rotate individual pages in a PDF

def rotate_pages(input_pdf, output_pdf, rotation):
    """Rotate every page of a PDF by the same angle and save the result.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path of the rotated PDF to write.
        rotation: Angle passed to ``page.rotate`` for each page
            (presumably clockwise degrees in multiples of 90; confirm
            against the PyPDF2 documentation).
    """
    source = PyPDF2.PdfReader(input_pdf)
    rotated = PyPDF2.PdfWriter()
    for page in source.pages:
        # rotate first, then hand the page to the writer
        page.rotate(rotation)
        rotated.add_page(page)

    with open(output_pdf, 'wb') as target:
        rotated.write(target)
    print(f"Rotated PDF saved as {output_pdf}")


# let's use the above function:
# rotate every page of merged.pdf by 90 and save the result as rotated.pdf

rotate_pages('merged.pdf', 'rotated.pdf', 90)

Rotated PDF saved as rotated.pdf


In [18]:
# adding metadata : read, add or modify metadata (title, author, etc)

def read_metadata(pdf_file):
    """Print the metadata entries of a PDF, one ``key: value`` per line.

    Args:
        pdf_file: Path of the PDF to inspect.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    metadata = pdf_reader.metadata

    print("Metadata of the PDF file is :")
    # metadata can be missing (None) or empty; calling .items() on None
    # would raise AttributeError, so report that case explicitly
    if not metadata:
        print("(no metadata found)")
        return
    for key, value in metadata.items():
        print(f"{key}: {value}")


# let's use the above function:
# show the metadata of merged.pdf before any title/author is added

read_metadata('merged.pdf')

Metadata of the PDF file is :
/Producer: PyPDF2


In [19]:
def add_metadata(input_pdf, output_pdf, tilte, author):
    """Copy a PDF page by page and set its title and author metadata.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path of the PDF to write.
        tilte: Value stored under ``/Title`` (the parameter name is
            misspelled; it is kept so keyword callers keep working).
        author: Value stored under ``/Author``.
    """
    source = PyPDF2.PdfReader(input_pdf)
    annotated = PyPDF2.PdfWriter()
    for page in source.pages:
        annotated.add_page(page)

    # only these two entries are set explicitly on the writer
    annotated.add_metadata({'/Title': tilte, '/Author': author})

    with open(output_pdf, 'wb') as target:
        annotated.write(target)
    print(f"Metadata added to PDF saved as {output_pdf}")


# let's use the above function:
# copy merged.pdf to metadata.pdf with a title and an author set

add_metadata('merged.pdf', 'metadata.pdf', 'Sample File', 'Sonika')

Metadata added to PDF saved as metadata.pdf


In [20]:
def read_metadata(pdf_file):
    """Print the metadata entries of a PDF, one ``key: value`` per line.

    NOTE(review): this cell repeats the ``read_metadata`` definition from the
    earlier metadata cell of this notebook; the later definition replaces the
    earlier one in the kernel, so the two should be kept identical (or this
    copy dropped).

    Args:
        pdf_file: Path of the PDF to inspect.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    metadata = pdf_reader.metadata

    print("Metadata of the PDF file is :")
    # metadata can be missing (None) or empty; calling .items() on None
    # would raise AttributeError, so report that case explicitly
    if not metadata:
        print("(no metadata found)")
        return
    for key, value in metadata.items():
        print(f"{key}: {value}")


# let's use the above function:
# metadata.pdf should now list the title and author that were just added

read_metadata('metadata.pdf')

Metadata of the PDF file is :
/Producer: PyPDF2
/Title: Sample File
/Author: Sonika


In [21]:
# optimize the size of the PDF file (compressing the PDF file)
import fitz  # PyMuPDF

def optimize_pdf(input_file, output_file):
    """Re-save a PDF with PyMuPDF's clean-up and compression options enabled.

    The document is saved with ``garbage=4`` and ``deflate=True`` (see the
    PyMuPDF ``Document.save`` documentation for what each option removes or
    compresses).

    Args:
        input_file: Path of the PDF to read.
        output_file: Path of the optimized PDF to write.
    """
    # the context manager closes the document even when save() fails
    with fitz.open(input_file) as pdf_document:
        pdf_document.save(output_file, garbage=4, deflate=True)
    print(f"Optimized PDF saved as {output_file}")


# let's use the above function:
# re-save merged.pdf as optimized.pdf with the clean-up/compression options

optimize_pdf('merged.pdf', 'optimized.pdf')

Optimized PDF saved as optimized.pdf


In [23]:
# building a file handling tool : creating executable ( .exe) files from python code

# example code : merging multiple PDF files


import PyPDF2
import os


def merge_pdfs(input_folder, output_pdf):
    """Merge every PDF found in a folder into a single output file.

    NOTE(review): this redefines the list-based
    ``merge_pdfs(pdf_list, output_path)`` from the first merge cell of this
    notebook; once this cell has run, the earlier signature is no longer
    available in the kernel.

    Args:
        input_folder: Folder whose ``.pdf`` files (extension matched
            case-insensitively) are merged. Files are taken in natural
            name order, so ``page_2.pdf`` comes before ``page_10.pdf``.
        output_pdf: Path of the merged PDF to write. It may live inside
            ``input_folder``; an existing file at that path is never used
            as an input and is only overwritten once the merge is ready.
    """
    import re  # local import: only needed for the natural-order sort key

    def _natural_key(name):
        # digit runs compare as numbers, the rest as text
        return [int(part) if part.isdigit() else part
                for part in re.split(r'(\d+)', name)]

    # Skip the output file instead of deleting it up front: a failed merge
    # then leaves the previous result in place.
    output_abs = os.path.normcase(os.path.abspath(output_pdf))
    pdf_files = sorted(
        (
            name for name in os.listdir(input_folder)
            if name.lower().endswith('.pdf')
            and os.path.normcase(os.path.abspath(os.path.join(input_folder, name))) != output_abs
        ),
        key=_natural_key,
    )

    pdf_writer = PyPDF2.PdfWriter()
    for name in pdf_files:
        # the reader gets the path, as in the other cells of this notebook,
        # so no file handle has to be managed here
        pdf_reader = PyPDF2.PdfReader(os.path.join(input_folder, name))
        for page in pdf_reader.pages:
            pdf_writer.add_page(page)

    # write the merged PDF to the output file ('wb' replaces an existing file)
    with open(output_pdf, 'wb') as out:
        pdf_writer.write(out)
    print(f"Merged PDF saved as {output_pdf}")

# Let's use the above function

input_folder = 'pdf_files' # path to the folder containing PDF files to merge
output_pdf = 'pdf_files/merged.pdf' # path to the output merged PDF file (note: it lives inside input_folder)

merge_pdfs(input_folder, output_pdf) # merge every PDF in input_folder into output_pdf

Merged PDF saved as pdf_files/merged.pdf
