# This example uses arcpy, PyPDF2, & PDFMiner to take a map series generated in ArcGIS and export the maps to PDF, then turn the matchline text into links to the matching bookmarked pages.

### First, we'll import PDFMiner to make a list of text boxes...

In [None]:
# What follows is a ridiculous number of imports to run only 20 lines of code, but so be it.
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTFigure

In [None]:
# Open the exported PDF and hand it to PDFMiner.
# NOTE: the path is hard-coded; point it at your own PDF before running.
my_pdf_path = '/Users/shannon/Desktop/DeltaEcology.pdf'
# Opened in binary mode; the handle is not closed anywhere in this cell.
# NOTE(review): presumably it has to stay open while PDFMiner reads pages
# from it in the later cells -- confirm before adding a close().
my_pdf = open(my_pdf_path, 'rb')
parser = PDFParser(my_pdf)
doc = PDFDocument(parser)
# Module-level accumulator that parse_layout() appends matching text boxes to.
my_text_boxes = []

In [None]:
# This is a function to dig through a text box in the pdf recursively
def parse_layout(layout, pagenum, text_bit):
    """Collect the text boxes in ``layout`` whose text contains ``text_bit``.

    Each match is appended to the module-level ``my_text_boxes`` list as
    ``[pagenum, bbox, text]``.  ``LTFigure`` children are searched
    recursively; nothing is returned.
    """
    for child in layout:
        if isinstance(child, (LTTextBox, LTTextLine)):
            # Text lines pass the isinstance test but are not recorded:
            # only objects whose class name contains "LTTextBox" are kept.
            if "LTTextBox" not in child.__class__.__name__:
                continue
            content = child.get_text()
            if text_bit in content:
                my_text_boxes.append([pagenum, child.bbox, content])
        elif isinstance(child, LTFigure):
            # Figures can contain further layout objects, so descend into them.
            parse_layout(child, pagenum, text_bit)

In [70]:
# For each page in the PDF, lay the page out with PDFMiner and collect every
# text box whose text contains "Page", together with its bounding coords.
rsrcmgr = PDFResourceManager()
laparams = LAParams()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
# Start from an empty list so re-running this cell does not duplicate entries.
my_text_boxes = []
for pagenumber, page in enumerate(PDFPage.create_pages(doc)):
    interpreter.process_page(page)
    layout = device.get_result()
    parse_layout(layout, pagenumber, "Page")
# Each entry is [zero-based page index, bounding box, text].
# print(...) with a single argument behaves the same on Python 2 and 3,
# unlike the bare print statement, which is a SyntaxError on Python 3.
for my_text_box in my_text_boxes:
    print(my_text_box)

[0, (439.44471, 191.36202637000002, 484.3835478495, 231.10285552000002), u'Page 3\n']
[0, (169.44182, 191.36202637000002, 214.38065784949998, 231.10285552000002), u'Page 2\n']
[1, (397.44426, 737.35138637, 442.3830978495, 777.09221552), u'Page 1\n']
[1, (530.88569, 464.23670637, 575.8245278494999, 503.97753552), u'Page 3\n']
[1, (337.68362, 191.84201637, 382.62245784950005, 231.58284552), u'Page 4\n']
[2, (127.44137, 737.35138637, 172.38020784949998, 777.09221552), u'Page 1\n']
[2, (36.00039, 464.23670637, 80.93922784949999, 503.97753552), u'Page 2\n']
[2, (67.68073, 33.557181889999995, 234.05468704399996, 231.58284552), u'Page 4\nopen water\nfluvial open water\ntidal freshwater emergent wetland\nnon-tidal freshwater emergent wetland\nwillow thicket\nwillow riparian scrub/shrub\nvalley foothill riparian\nwet meadow/seasonal wetland\nvernal pool complex\nalkali seasonal wetland complex\nstabilized interior dune vegetation\ngrassland\noak woodland/savanna\n']
[2, (337.68362, 191.84201637

### Okay, we have what we need from PDFMiner. Let's move on to PyPDF.
Now we're going to make a link every place we found the right sort of text box, and then finally output a new file.

In [76]:
from PyPDF2 import PdfFileWriter, PdfFileReader
from PyPDF2.pdf import RectangleObject

# Read the same source PDF again, this time through PyPDF2.
# The file handle passed to the reader is intentionally not closed here.
my_pdf_reader = PdfFileReader(open(my_pdf_path, 'rb'))

# Build the output document and copy every page of the reader into it.
writer = PdfFileWriter()
writer.appendPagesFromReader(my_pdf_reader)

# We'll use this function to map bookmarks to their destination page
def _setup_page_id_to_num(pdf, pages=None, _result=None, _num_pages=None):
    if _result is None:
        _result = {}
    if pages is None:
        _num_pages = []
        pages = pdf.trailer["/Root"].getObject()["/Pages"].getObject()
    t = pages["/Type"]
    if t == "/Pages":
        for page in pages["/Kids"]:
            _result[page.idnum] = len(_num_pages)
            _setup_page_id_to_num(pdf, page.getObject(), _result, _num_pages)
    elif t == "/Page":
        _num_pages.append(1)
    return _result

# This bit makes a list of [bookmark title, 1-based page number] pairs.
def _flatten_outline(entries):
    """Yield every bookmark in a PyPDF2 outline, descending into sub-lists.

    getOutlines() represents child bookmarks as nested lists; iterating the
    outline directly would hand those lists to code expecting a bookmark.
    """
    for entry in entries:
        if isinstance(entry, list):
            for nested in _flatten_outline(entry):
                yield nested
        else:
            yield entry


bookmark_list = []
dest_map = _setup_page_id_to_num(my_pdf_reader)
outline = my_pdf_reader.getOutlines()
for bookmark in _flatten_outline(outline):
    # dest_map holds zero-based indices; store the page number 1-based.
    pg_num = dest_map[bookmark.page.idnum] + 1
    bookmark_list.append([bookmark.title, pg_num])

# Now go through every text box; if its text contains a bookmark title,
# turn the box into a link to that bookmark's page.
for my_text_box in my_text_boxes:
    link_dest = 0  # 0 means "no matching bookmark"; real page numbers start at 1
    best_title_len = -1
    for bookmark in bookmark_list:
        # Prefer the longest matching title so that "Page 1" cannot win over
        # "Page 10" just because of bookmark order.  On equal length the
        # later bookmark wins, matching the previous last-match behaviour.
        if bookmark[0] in my_text_box[2] and len(bookmark[0]) >= best_title_len:
            best_title_len = len(bookmark[0])
            link_dest = bookmark[1]
    if link_dest > 0:
        rect = RectangleObject(my_text_box[1])
        # Bookmark page numbers are 1-based here; subtract 1 to get back to
        # the zero-based page index that is passed to addLink.
        # print(...) with one argument works on both Python 2 and 3.
        print([my_text_box[0], link_dest - 1, rect])
        writer.addLink(my_text_box[0], link_dest - 1, rect)

# And finally, write the results out to a new file.
# The output path differs from the input path ("_withlinks" suffix), so the
# source PDF is not overwritten.  NOTE: the path is hard-coded.
with open(r'/Users/shannon/Desktop/DeltaEcology_withlinks.pdf', 'wb') as new_pdf:
    writer.write(new_pdf)


[[u'Page 1', 1], [u'Page 2', 2], [u'Page 3', 3], [u'Page 4', 4], [u'Page 5', 5]]
[0, 2, RectangleObject([439.44471, 191.36203, 484.38355, 231.10286])]
[0, 1, RectangleObject([169.44182, 191.36203, 214.38066, 231.10286])]
[1, 0, RectangleObject([397.44426, 737.35139, 442.3831, 777.09222])]
[1, 2, RectangleObject([530.88569, 464.23671, 575.82453, 503.97754])]
[1, 3, RectangleObject([337.68362, 191.84202, 382.62246, 231.58285])]
[2, 0, RectangleObject([127.44137, 737.35139, 172.38021, 777.09222])]
[2, 1, RectangleObject([36.00039, 464.23671, 80.93923, 503.97754])]
[2, 3, RectangleObject([67.68073, 33.55718, 234.05469, 231.58285])]
[2, 4, RectangleObject([337.68362, 191.84202, 382.62246, 231.58285])]
[3, 1, RectangleObject([229.20246, 736.8714, 274.1413, 776.61223])]
[3, 2, RectangleObject([499.20535, 736.8714, 544.14419, 776.61223])]
[3, 4, RectangleObject([530.88569, 464.23671, 575.82453, 503.97754])]
[4, 2, RectangleObject([229.20246, 736.8714, 274.1413, 776.61223])]
[4, 3, RectangleObj