### Extracting all the text

In [6]:
import io
 
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
 
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF as one string.

    All pages are rendered through a single ``TextConverter`` that writes
    into an in-memory buffer; the buffer's contents are returned once every
    page has been processed.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The accumulated text of all pages, or ``None`` when nothing was
        extracted (empty buffer).

    Raises:
        OSError: If ``pdf_path`` cannot be opened.
        Errors raised by pdfminer while parsing are propagated unchanged
        (presumably including a refusal for documents that forbid text
        extraction, since ``check_extractable=True`` -- confirm against the
        installed pdfminer version).
    """
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle)

    try:
        page_interpreter = PDFPageInterpreter(resource_manager, converter)

        with open(pdf_path, 'rb') as fh:
            for page in PDFPage.get_pages(fh,
                                          caching=True,
                                          check_extractable=True):
                page_interpreter.process_page(page)

        # Read the buffer before it is closed in the ``finally`` clause.
        text = fake_file_handle.getvalue()
    finally:
        # Close the handles even when opening or parsing the PDF fails.
        converter.close()
        fake_file_handle.close()

    # Empty output is reported as None rather than '' (original contract).
    return text if text else None
 
if __name__ == '__main__':
    # Demo run: dump the full text of the sample report to stdout.
    print(
        extract_text_from_pdf(
            'Data/www.trasers.com » Actions - 2019-08-22.pdf'
        )
    )


www.trasers.com » Actions - 2019-08-22Visitor detailVisits:5Unique ID:2749500497IP address:67.188.228.0 - ARIN  RIPE Locale: Redwood City, California, USA / EnglishOrganization: Comcast Cable / comcast.netPlatform: Google Chrome 75.0    Mac OS X  1440x900  All time goals: Main Register Step 1 (Aug 22 2019) This visit First visitDate:Wed Aug 14 2019, 9:14amYou are currently viewing this person's ﬁrst visit.Session:10m 20s, 3 actionsLanding page: /blog/technologyReferrer: google.com [secure search] SearchesAug 14 20199:14:12 am /blog/technologyTechnology Transformation Blogs | TRASERSAug 14 20199:14:20 am /insightsMarket Research Industry Insights for EnterprisesAug 14 20199:14:32 am /insights/be-proactive-or-perish-the-transformation-tsunami-will-be-merciless-on-laggardsBe Proactive or Perish: The Transformation Tsunami Will Be Merciless on Laggards | Trasers


### Extracting text by page

In [5]:
# miner_text_generator.py
 
import io
 
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
 
def extract_text_by_page(pdf_path):
    """Lazily yield the extracted text of each page of a PDF.

    Each page is rendered into its own in-memory buffer through a fresh
    ``TextConverter``, so every yielded string holds exactly one page.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode and
            stays open for as long as the generator is being consumed.

    Yields:
        str: The text of one page, in the order ``PDFPage.get_pages``
        produces them. A page without extractable text yields ``''``.

    Raises:
        OSError: If ``pdf_path`` cannot be opened (raised on the first
            ``next()`` call, because this is a generator).
        Errors raised by pdfminer while parsing are propagated unchanged.
    """
    # One resource manager serves every page, as in extract_text_from_pdf;
    # only the converter and its buffer have to be per-page.
    resource_manager = PDFResourceManager()

    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh,
                                      caching=True,
                                      check_extractable=True):
            fake_file_handle = io.StringIO()
            converter = TextConverter(resource_manager, fake_file_handle)
            try:
                page_interpreter = PDFPageInterpreter(resource_manager,
                                                      converter)
                page_interpreter.process_page(page)
                text = fake_file_handle.getvalue()
            finally:
                # Close before yielding: code placed after a ``yield`` is
                # skipped when the consumer stops iterating early, and would
                # also be skipped if process_page raised.
                converter.close()
                fake_file_handle.close()

            yield text
 
def extract_text(pdf_path):
    """Print the text of every page of a PDF to stdout.

    Args:
        pdf_path: Path of the PDF file, passed on to
            ``extract_text_by_page``.

    Returns:
        None. Each page's text is written to stdout followed by one blank
        line.
    """
    for page_text in extract_text_by_page(pdf_path):
        # '\n\n' = the line terminator plus the blank separator line.
        print(page_text, end='\n\n')
 
if __name__ == '__main__':
    # extract_text prints each page itself and returns None, so its result
    # must not be wrapped in print() -- that would emit a stray "None".
    extract_text('Data/www.trasers.com » Actions - 2019-08-22.pdf')

www.trasers.com » Actions - 2019-08-22Visitor detailVisits:5Unique ID:2749500497IP address:67.188.228.0 - ARIN  RIPE Locale: Redwood City, California, USA / EnglishOrganization: Comcast Cable / comcast.netPlatform: Google Chrome 75.0    Mac OS X  1440x900  All time goals: Main Register Step 1 (Aug 22 2019) This visit First visitDate:Wed Aug 14 2019, 9:14amYou are currently viewing this person's ﬁrst visit.Session:10m 20s, 3 actionsLanding page: /blog/technologyReferrer: google.com [secure search] SearchesAug 14 20199:14:12 am /blog/technologyTechnology Transformation Blogs | TRASERSAug 14 20199:14:20 am /insightsMarket Research Industry Insights for EnterprisesAug 14 20199:14:32 am /insights/be-proactive-or-perish-the-transformation-tsunami-will-be-merciless-on-laggardsBe Proactive or Perish: The Transformation Tsunami Will Be Merciless on Laggards | Trasers

None
