# Extracting Text from a PDF

### PDFMiner is a good library for extracting text. It works like this...

In [27]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO


def convert_pdf_to_txt(path):
    """Extract the text of a PDF with PDFMiner and print it.

    Every page yielded by ``PDFPage.get_pages`` is run through a
    ``TextConverter`` that writes into an in-memory buffer; the buffer's
    contents are printed once all pages have been processed.

    Args:
        path: Filesystem path of the PDF to read (opened in binary mode).

    Returns:
        None. The extracted text is written to stdout via ``print``.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()  # in-memory buffer the converter writes into
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    try:
        # open() replaces the Python-2-only file() builtin, and the with-block
        # closes the handle even when a page fails to process.
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Empty page-number set, as before (presumably "all pages" --
            # confirm against the PDFMiner documentation).
            pagenos = set()
            for page in PDFPage.get_pages(fp, pagenos, check_extractable=True):
                interpreter.process_page(page)
        # Read the buffer before it is closed in the finally clause.
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    print(text)


In [None]:
# NOTE(review): absolute path from the author's machine -- point this at a PDF
# available locally before running the cell.
convert_pdf_to_txt(r"/Users/shannon/Desktop/Crawl.pdf")

### Let's see what happens if we try it with PyPDF2...

In [29]:
from PyPDF2 import PdfFileReader
def convert_pdf_to_txt_pypdf2(path):
    """Extract the text of a PDF with PyPDF2 and print it page by page.

    Args:
        path: Filesystem path of the PDF to read (opened in binary mode).

    Returns:
        None. Each page's extracted text is UTF-8 encoded and printed.
    """
    # Keep the file open only for as long as the reader needs it; the original
    # passed an anonymous open() handle that was never closed.
    with open(path, 'rb') as pdf_file:
        reader = PdfFileReader(pdf_file)
        for i in range(reader.getNumPages()):
            page_content = reader.getPage(i).extractText()
            print(page_content.encode('utf-8'))

In [30]:
# NOTE(review): absolute path from the author's machine -- point this at a PDF
# available locally before running the cell.
convert_pdf_to_txt_pypdf2(r"/Users/shannon/Desktop/Crawl.pdf")

A long time ago in a galaxy far, far awayÉ
Star 
Wars
Episode V
THE EMPIRE STRIKES BACK
It is a dark time for the Rebellion. 
Although the Death Star has been 
destroyed, Imperial troops have driven 
the Rebel forces from their hidden 
base and pursued them across 
the galaxy.
Evading the dreaded Imperial 
Starßeet, a group of freedom Þghters 
led by Luke Skywalker has established 
a new secret base on the remote ice 
world of 
Hoth.The evil lord Darth Vader, obsessed 
with Þnding young Skywalker, has 
dispatched thousands of remote 
probes into 
the far reaches of space....



### So why would we use PDFMiner at all?
Well, it comes down to text boxes. PyPDF2 can (usually) read the text inside a PDF's text boxes, but if you want to keep the contents of individual text boxes separate, or get the coordinates of those boxes, PDFMiner is the better option.

In [31]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBox, LTTextLine, LTFigure

# This is a function to dig through the contents and attributes of a text box in the pdf recursively
def parse_layout(layout, pagenum, text_bit):
    """Walk a layout tree and print every text box containing ``text_bit``.

    For each match, the box's text is printed followed by its ``bbox``.
    ``LTFigure`` children are descended into recursively. ``pagenum`` is only
    forwarded to the recursive call; it does not affect what is printed.
    """
    for element in layout:
        if isinstance(element, (LTTextBox, LTTextLine)):
            # Of the text objects, only those whose class name contains
            # "LTTextBox" are reported; anything else is skipped.
            if "LTTextBox" not in element.__class__.__name__:
                continue
            content = element.get_text()
            if text_bit not in content:
                continue
            print(content)
            print(element.bbox)
        elif isinstance(element, LTFigure):
            # Figures are containers: search their children the same way.
            parse_layout(element, pagenum, text_bit)
            
# This is a function to locate text boxes in the pdf
def find_a_text_box(path, search_string):
    """Print every text box in a PDF whose text contains ``search_string``.

    Each page is processed through a ``PDFPageAggregator`` and the resulting
    layout is handed to ``parse_layout``, which prints the text and bounding
    box of each matching text box.

    Args:
        path: Filesystem path of the PDF to read (opened in binary mode).
        search_string: Substring to look for in each text box's text.

    Returns:
        None. Matches are reported on stdout by ``parse_layout``.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # The with-block closes the file once all pages are processed, or as soon
    # as an error is raised; the original never closed the handle.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        for pagenumber, page in enumerate(PDFPage.create_pages(document)):
            interpreter.process_page(page)
            layout = device.get_result()
            parse_layout(layout, pagenumber, search_string)

In [None]:
# NOTE(review): absolute path from the author's machine -- point this at a PDF
# available locally before running the cell.
find_a_text_box(r"/Users/shannon/Desktop/Crawl.pdf", "far reaches of space")