# This example uses arcpy, PyPDF2, and PDFMiner to export a map series generated in ArcGIS to a PDF, then turn the matchline text into links that jump to the appropriate pages

### We'll start in arcpy
If you are working in ArcMap (sorry, not ArcGIS Pro) and you have data-driven pages set up in your MXD, this will produce a multi-page PDF export for you and also harvest the page names to use for bookmarks.

In [None]:
import arcpy, os

#Creates the (initially empty) multi-page pdf document that every exported page gets appended to
print("Creating pdf document")
out_dir = r'/Users/shannon/Desktop'
output_pdf_path_fromArcMap = os.path.join(out_dir, 'DeltaEcology.pdf')
output_pdf_fromArcMap = arcpy.mapping.PDFDocumentCreate(output_pdf_path_fromArcMap)

#Grabs some variables from the mxd
mxd = arcpy.mapping.MapDocument(r'extras/MySampleMXD.mxd')
ddp = mxd.dataDrivenPages #Note that if you don't have data driven pages enabled in your map, this will throw an error

#For each page in the data driven page set...
temp_pdf_path = os.path.join(out_dir,'temp.pdf') #Single-page scratch file, reused for every page
bookmark_list = []
for page_num in range(1, ddp.pageCount + 1): #Page IDs start at 1, hence the +1 on the upper bound
    ddp.currentPageID = page_num #Go to page
    page_name = ddp.pageRow.getValue(ddp.pageNameField.name) #Get the name of the page
    print (" Exporting " + page_name)
    try:
        arcpy.mapping.ExportToPDF(mxd, temp_pdf_path) #Export the pdf to a temporary location
        output_pdf_fromArcMap.appendPages(temp_pdf_path) #Append the temporary pdf to the main pdf
    finally:
        #Clean up the scratch file even when the export or append fails, so a
        #failed run doesn't leave a stray temp.pdf behind in out_dir
        if os.path.exists(temp_pdf_path):
            os.remove(temp_pdf_path)
    bookmark_list.append(page_name) #This is a list of the bookmarks we need to make
output_pdf_fromArcMap.saveAndClose()
print(bookmark_list)

In [77]:
# Demo purposes only: hard-coded stand-ins for the values the arcpy cell
# produces (output folder, exported pdf path, and the list of page names).
import os
out_dir = "/Users/shannon/Desktop"
output_pdf_path_fromArcMap = os.path.join(out_dir, "DeltaEcology.pdf")
bookmark_list = [
    "Page 1",
    "Page 2",
    "Page 3",
    "Page 4",
    "Page 5",
]

### The next step is to use PDFMiner to make a list of text boxes in the PDF

In [78]:
# What follows is a ridiculous number of imports to run only 20 lines of code, but so be it.
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTFigure

# Accumulator for the matches that parse_layout() finds
my_text_boxes = list()

# Hand the exported PDF to PDFMiner. The file handle is kept in its own
# variable because it is closed separately once the pages have been processed.
pdf_for_pdfminer = open(output_pdf_path_fromArcMap, mode='rb')
parser = PDFParser(pdf_for_pdfminer)
doc = PDFDocument(parser)

def clean_my_text(str):
    """Return the given text with every space and newline character removed.

    Only ' ' and '\\n' are stripped; other whitespace (tabs, carriage
    returns) is left untouched.
    """
    # NOTE: the parameter name shadows the builtin `str`; it is kept as-is so
    # the function's signature stays the same for existing callers.
    without_spaces = str.replace(" ", "")
    return without_spaces.replace('\n', "")

# Walks a PDFMiner layout tree and records every text box that mentions a page name
def parse_layout(layout, pagenum):
    """Collect text boxes under `layout` whose text contains a bookmark name.

    For each match, [pagenum, bounding box, bookmark] is appended to the
    module-level list `my_text_boxes`. A text box is compared against every
    entry of the module-level `bookmark_list`, first verbatim and then with
    spaces/newlines stripped from both sides (via clean_my_text). Figures are
    searched recursively. Nothing is returned.
    """
    for element in layout:
        if isinstance(element, (LTTextBox, LTTextLine)):
            # Of the text objects, only those whose class name contains
            # "LTTextBox" are recorded; the others are skipped.
            if "LTTextBox" not in element.__class__.__name__:
                continue
            box_text = element.get_text()
            squashed_text = clean_my_text(box_text)
            my_text_boxes.extend(
                [pagenum, element.bbox, bookmark]
                for bookmark in bookmark_list
                if bookmark in box_text or clean_my_text(bookmark) in squashed_text
            )
        elif isinstance(element, LTFigure):
            # A figure is a container: descend into it to reach any text it holds
            parse_layout(element, pagenum)
            
# And now, for each page in the PDF, look for text boxes that meet our criteria,
# and get back a list of the bounding coords.
# parse_layout() appends one [page index, bounding box, bookmark] entry per match.
rsrcmgr = PDFResourceManager()
laparams = LAParams()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
my_text_boxes = []  # start this pass from an empty list
bad_objects = []  # NOTE(review): not used anywhere in the visible code
try:
    # enumerate() is zero-based, so the recorded page indices start at 0
    for pagenumber, page in enumerate(PDFPage.create_pages(doc)):
        interpreter.process_page(page)
        layout = device.get_result()
        parse_layout(layout, pagenumber)
finally:
    # Always release the file handle, even if PDFMiner fails part-way through
    pdf_for_pdfminer.close()

# print(...) with a single argument produces the same output whether print is
# a statement or a function, matching the other print calls in this file
for my_text_box in my_text_boxes:
    print(my_text_box)

[0, (273.2386702, 194.4906462, 305.10977009, 222.58327519999997), 'Page 2']
[0, (313.5634, 194.4906462, 345.43449989, 222.58327519999997), 'Page 3']
[0, (487.2052, 147.5574231, 512.7316823, 170.1687376), 'Page 1']
[0, (468.96520970000006, 108.67863399999999, 494.49169200000006, 131.28994849999998), 'Page 2']
[0, (506.1659194, 108.67863399999999, 531.6924017, 131.28994849999998), 'Page 3']
[0, (475.6814545000001, 70.03744189999999, 501.2079368000001, 92.6487564), 'Page 4']
[0, (512.8821642, 70.03744189999999, 538.4086465, 92.6487564), 'Page 5']
[1, (365.7639, 742.3999461999999, 397.63499989, 770.4925752), 'Page 1']
[1, (562.8850169, 460.13567178, 590.9785023999999, 492.0058), 'Page 3']
[1, (354.0038, 194.2506462, 385.87489989000005, 222.3432752), 'Page 4']
[1, (487.2052, 147.5574231, 512.7316823, 170.1687376), 'Page 1']
[1, (468.96520970000006, 108.67863399999999, 494.49169200000006, 131.28994849999998), 'Page 2']
[1, (506.1659194, 108.67863399999999, 531.6924017, 131.28994849999998), '

### Okay, we have what we need from PDFMiner. Let's move on to PyPDF2.
Now we're going to make a bookmark for every page, and then link every place we found the right sort of text box to the bookmark.

In [79]:
from PyPDF2 import PdfFileWriter, PdfFileReader
from PyPDF2.pdf import RectangleObject

# Open up the pdf reader. The input file stays open (and is closed automatically
# by the `with` block) until the writer has written its output, because the
# pages handed to the writer come from this reader.
with open(output_pdf_path_fromArcMap, 'rb') as input_pdf_for_pypdf:
    pdf_for_pypdf_reader = PdfFileReader(input_pdf_for_pypdf)

    # Designate the output file
    writer = PdfFileWriter()

    # Now let's go through the final pdf and set up the bookmarks:
    # page i of the input gets the i-th name in bookmark_list
    # (assumes one bookmark name per page, in page order)
    for i, bookmark_title in enumerate(bookmark_list):
        writer.addPage(pdf_for_pypdf_reader.getPage(i))
        writer.addBookmark(bookmark_title, i)

    #Now we're going to go through every text box and if it has a matching bookmark, we'll make it a link
    for my_text_box in my_text_boxes:
        # Each entry is [page index, bounding box, bookmark name]
        page_num = my_text_box[0]
        rect = RectangleObject(my_text_box[1])
        # The bookmark's position in bookmark_list is the index of the page it points at
        link_dest = bookmark_list.index(my_text_box[2])
        writer.addLink(page_num, link_dest, rect)

    #And finally, write the results out to a new file
    with open(os.path.join(out_dir, 'DeltaEcology_withBookmarksAndLinks.pdf'), 'wb') as output_pdf_fromPyPDF:
        writer.write(output_pdf_fromPyPDF)