In [1]:
import os
import re
import glob

import numpy as np

import PyPDF2
from PyPDF2 import PdfFileReader, PdfFileWriter
from fpdf import FPDF

from tkinter import *
from PIL import Image,ImageTk
from pdf2image import convert_from_path

In [2]:
# Locations used by the notebook (all relative to the working directory)

# Input directory with the PDFs to clean; the value ends with a path separator
filepath_PDFs = os.path.join("data", "samples") + os.sep
# Output directory for the cleaned PDFs
directory_PDFs_cleaned_pdf = os.path.join("data", "cleaned", "PDF")
# Scratch directory for the temporary PDFs with added page numbers
directory_PDFs_added_numbers = os.path.join("data", "tmp_data")

In [3]:
# The input directory must already exist; without it there is nothing to clean
if not os.path.isdir(filepath_PDFs):
    raise ValueError("Directory " + filepath_PDFs + " does not exists. Please specify correct data input directory holding the business plans.")

# Create the output directories when they are missing. exist_ok=True makes the
# call a no-op for an existing directory, so no separate isdir() check (and no
# gap between checking and creating) is needed
for output_directory in (directory_PDFs_cleaned_pdf, directory_PDFs_added_numbers):
    os.makedirs(output_directory, exist_ok=True)

In [4]:
# Collect the paths of all business plans stored as .pdf in the input directory.
# glob returns its matches in arbitrary order, so the list is sorted to present
# the files in the same order on every run
filepath_PDFs_list = sorted(glob.glob(os.path.join(filepath_PDFs, "*.pdf")))

In [5]:
# Clean the PDFs: every file found in the input directory is processed in turn

# Suffix appended to the PDF name to document the cleaning step
keyword_start_tags = "_tags"

for filepath in filepath_PDFs_list:

    # Name of the PDF = file name without directory and without extension.
    # basename/splitext only touch the leading directory and the final
    # extension, so a ".pdf" (or the directory string) occurring inside the
    # file name is kept, and an upper-case ".PDF" extension is removed as well
    pdf_name = os.path.splitext(os.path.basename(filepath))[0]
    # NOTE(review): func_clean_firmname is not defined in any visible cell; it
    # has to be defined or imported before this cell runs, otherwise this line
    # raises NameError
    pdf_name = func_clean_firmname(pdf_name)
    pdf_name = pdf_name + keyword_start_tags

    # Load the PDF from the input directory
    pdf = PyPDF2.PdfFileReader(filepath)

    # Zero-based indices of all pages of the PDF
    list_pages = list(range(pdf.getNumPages()))

    # Path of the temporary copy of this PDF with page numbers added ...
    filepath_pdf_with_numbers = os.path.join(directory_PDFs_added_numbers, pdf_name+"_addedNumbersOnPages"+".pdf")
    # ... and of the temporary PDF that contains only the page numbers
    filepath_pdf_tmp_numbers  = os.path.join(directory_PDFs_added_numbers, "tmp"+".pdf")

    # Helper document made of otherwise blank pages that carry a page counter;
    # it is used to put visible page numbers on the pages shown for cleaning
    class NumberPDF(FPDF):
        """FPDF document whose pages contain only a "Page X of Y" footer."""

        def __init__(self, numberOfPages):
            """Remember the total page count that is printed in every footer."""
            super().__init__()
            self.numberOfPages = numberOfPages

        def header(self):
            """Override the page header so that nothing is drawn."""
            return None

        def footer(self):
            """Draw the centred page counter in the footer of the current page."""
            # A negative y position is measured from the bottom of the page
            self.set_y(-15)
            self.set_font('Arial', 'I', 30)
            footer_text = f"Page {self.page_no()} of {self.numberOfPages}"
            self.cell(0, 10, footer_text, 0, 0, 'C')


    # Open the file that gets the page numbers. This reader is separate from
    # `pdf`: mergePage() below changes the pages it is called on, so the
    # overlay presumably should not end up on the pages read through `pdf`
    inputFile = PyPDF2.PdfFileReader(filepath)
    outputFile = filepath_pdf_with_numbers

    # Build the temporary numbering PDF with one page per page of the original
    # file (the footer function runs on add_page and puts the page number at
    # the bottom, everything else stays blank)
    number_of_pages = inputFile.getNumPages()
    tempNumFile = NumberPDF(number_of_pages)
    for _ in range(number_of_pages):
        tempNumFile.add_page()

    try:
        # Save the temporary numbering PDF and read it back with PyPDF2
        tempNumFile.output(filepath_pdf_tmp_numbers)
        mergeFile = PyPDF2.PdfFileReader(filepath_pdf_tmp_numbers)

        # Writer for the numbered output document
        mergeWriter = PyPDF2.PdfFileWriter()

        # Overlay every numbering page on the corresponding original page
        for x, page in enumerate(mergeFile.pages):
            inputPage = inputFile.getPage(x)
            inputPage.mergePage(page)
            mergeWriter.addPage(inputPage)

        # Write the merged output
        with open(outputFile, 'wb') as fh:
            mergeWriter.write(fh)
    finally:
        # Always delete the temporary numbering PDF, also when reading,
        # merging or writing failed, so no stale tmp file is left behind
        if os.path.exists(filepath_pdf_tmp_numbers):
            os.remove(filepath_pdf_tmp_numbers)
        
        
        
    ##
    ##  GUI: page viewer used to choose the pages to delete
    ##

    # Tk window for the current PDF
    root = Tk()

    # Frame for the PDF viewer. pack() returns None, so the frame is stored
    # first and packed afterwards; otherwise the scrollbar and the text widget
    # would be created with parent None instead of this frame
    pdf_frame = Frame(root)
    pdf_frame.pack(fill=BOTH, expand=1)

    # Vertical scrollbar coupled to the text widget that displays the pages
    scrol_y = Scrollbar(pdf_frame, orient=VERTICAL)
    pdf_image = Text(pdf_frame, yscrollcommand=scrol_y.set, bg="grey")
    scrol_y.pack(side=RIGHT, fill=Y)
    scrol_y.config(command=pdf_image.yview)
    pdf_image.pack(fill=BOTH, expand=1)

    # Render the numbered PDF as one image per page
    pages = convert_from_path(filepath_pdf_with_numbers, size=(1240, 1754))

    # The list keeps a reference to every Tk image while the window is open
    photos = [ImageTk.PhotoImage(page_image) for page_image in pages]

    # Insert the page images into the text widget, separated by blank lines
    for photo in photos:
        pdf_image.image_create(END, image=photo)
        pdf_image.insert(END, '\n\n')

    # Control area below the viewer: entry, current selection and buttons
    width_canvas = 1240

    canvas1 = Canvas(root, width=width_canvas, height=180)
    canvas1.pack()

    entry1 = Entry(root)
    canvas1.create_window(width_canvas/2, 35, window=entry1)

    # One label that is updated in place; creating a new label on every click
    # would stack widgets on top of each other on the canvas
    label1 = Label(root, text="")
    canvas1.create_window(width_canvas/2, 60, window=label1)

    # Page numbers shown in the viewer start at 1 (they are translated to
    # zero-based indices after the window is closed)
    last_page_number = pdf.getNumPages()

    # Selected page numbers. The callbacks only modify this list in place, so
    # the code after the main loop sees the selection without `global`
    list_pages_to_delete = []

    def _read_page_number():
        """Return the integer typed into the entry, or None if it is not an
        integer. The entry is cleared in both cases."""
        raw_value = entry1.get()
        entry1.delete(0, END)
        try:
            return int(raw_value)
        except ValueError:
            return None

    def _show_pages_to_delete():
        """Display the current selection as space-separated page numbers."""
        label1.config(text=" ".join(str(page) for page in list_pages_to_delete))

    def get_pages_to_delete():
        """Add the entered page number to the selection.

        Input that is not an integer or lies outside 1..last_page_number is
        ignored. The selection stays sorted and free of duplicates.
        """
        page_number = _read_page_number()
        if (page_number is not None
                and 1 <= page_number <= last_page_number
                and page_number not in list_pages_to_delete):
            list_pages_to_delete.append(page_number)
            list_pages_to_delete.sort()
        _show_pages_to_delete()

    def delete_page_number():
        """Remove the entered page number from the selection, if present."""
        page_number = _read_page_number()
        if page_number in list_pages_to_delete:
            list_pages_to_delete.remove(page_number)
        _show_pages_to_delete()

    button1 = Button(root, text='Add Page Number to be deleted.', command=get_pages_to_delete)
    canvas1.create_window(width_canvas/2, 95, window=button1)

    button2 = Button(root, text='Delete page number from list.', command=delete_page_number)
    canvas1.create_window(width_canvas/2, 122, window=button2)

    button3 = Button(root, text='End and close.', command=root.destroy)
    canvas1.create_window(width_canvas/2, 149, window=button3)

    # Block until the window is closed
    root.mainloop()
    
    
    
    
    # Remove the temporary PDF with the added page numbers
    os.remove(filepath_pdf_with_numbers)

    # Shift the selected page numbers by one so that they start from 0
    list_pages_to_delete = list(map(lambda page_number: page_number - 1, list_pages_to_delete))

    # Indices of the pages that are kept, i.e. all pages not selected for deletion
    list_pages_removed = []
    for page in list_pages:
        if page in list_pages_to_delete:
            continue
        list_pages_removed.append(page)

    # Assemble a new PDF from the kept pages only
    pdf_cleaned = PdfFileWriter()
    for page_num in list_pages_removed:
        clean_page = pdf.getPage(page_num)
        pdf_cleaned.addPage(clean_page)

    # Document the page removal in the file name
    pdf_name = pdf_name + "_removedPages"

    # Save the cleaned PDF as .pdf in the output directory
    filepath_pdf_cleaned = os.path.join(directory_PDFs_cleaned_pdf, pdf_name + ".pdf")
    with open(filepath_pdf_cleaned, "wb") as f:
        pdf_cleaned.write(f)

    #
    #              Removing the original PDF from the samples folder is
    #              currently disabled
    #
    #os.remove(filepath)

NameError: name 'func_clean_firmname' is not defined