In [5]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import pdfminer
import sys
import pandas as pd

In [6]:
# Dependency: pdfminer.six. If it is missing, install it into the kernel's environment with: %pip install pdfminer.six

In [9]:
def flatten(l):
    """Concatenate the items of every sub-iterable in ``l`` into one flat list.

    Only one level of nesting is removed; items that are themselves
    containers are copied over unchanged.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

def parse_obj(lt_objs):
    """Collect every non-blank text line found in a tree of pdfminer layout objects.

    Parameters
    ----------
    lt_objs : iterable
        Layout objects to inspect. Horizontal text boxes and figures are
        descended into recursively via their ``_objs`` children.

    Returns
    -------
    list of tuple
        ``(x, y, text)`` triples in traversal order, where ``x`` and ``y`` are
        ``bbox[0]`` and ``bbox[1]`` truncated to ``int`` and ``text`` is the
        line's text with surrounding whitespace stripped.
    """
    found = []
    for element in lt_objs:
        if isinstance(element, pdfminer.layout.LTTextLine):
            stripped = element.get_text().strip()
            # whitespace-only lines carry no content and are dropped
            if stripped:
                x0, y0 = element.bbox[0], element.bbox[1]
                found.append((int(x0), int(y0), stripped))
        # text boxes and figures are containers: recurse into their children
        if isinstance(element, (pdfminer.layout.LTTextBoxHorizontal,
                                pdfminer.layout.LTFigure)):
            found.extend(parse_obj(element._objs))
    return found

In [11]:
def parsepdf(filename='Data/report3.pdf', startpage=None, endpage=None):
    """Parse a PDF into one DataFrame of positioned text lines per page.

    Parameters
    ----------
    filename : str or binary file object
        Path of the PDF, or an already-open binary file object. A path is
        opened and closed by this function; a file object passed in by the
        caller is left open.
    startpage : int, optional
        Zero-based index of the first page to parse. Defaults to the first
        page; in that case the document's page count is also printed.
    endpage : int, optional
        Zero-based index of the last page to parse (inclusive). Defaults to
        the end of the document.

    Returns
    -------
    list of pandas.DataFrame
        One frame per parsed page, built from ``parse_obj`` (columns ``x``,
        ``y``, ``text``) and then passed through ``sort_rows``.

    Raises
    ------
    PDFTextExtractionNotAllowed
        If the document reports that it is not extractable.
    """
    # Only close the handle if it was opened here.
    owns_file = isinstance(filename, str)
    fp = open(filename, 'rb') if owns_file else filename
    try:
        # parse the pdf into a big list of commands of the form (x coord, y coord, text)
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        if startpage is None:
            from pdfminer.pdfinterp import resolve1
            startpage = 0
            page_count = resolve1(document.catalog['Pages'])['Count']
            print("Number of pages: ", page_count)
            # Respect an explicit endpage instead of overwriting it.
            if endpage is None:
                endpage = page_count

        res = []
        for i, page in enumerate(PDFPage.create_pages(document)):
            # endpage=None means "through the last page"; comparing an int
            # with None would raise TypeError.
            if endpage is not None and i > endpage:
                break  # past the requested range, no need to walk the rest
            if i < startpage:
                continue
            # read the page into a layout object
            interpreter.process_page(page)
            layout = device.get_result()
            # extract text, and sort it from top to bottom, left to right
            df = pd.DataFrame(parse_obj(layout._objs), columns=['x', 'y', 'text'])
            res.append(sort_rows(df))
        return res
    finally:
        if owns_file:
            fp.close()

In [12]:
def sort_rows(df, eps=3, min_samples=2):
    """Snap text fragments onto shared rows and order them top-to-bottom, left-to-right.

    The ``y`` coordinates are clustered with DBSCAN; every fragment in a
    cluster has its ``y`` replaced by the cluster's minimum ``y`` so that
    fragments of one visual row share one value. Fragments labelled ``-1``
    (DBSCAN's noise label) keep their own ``y``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with columns ``x``, ``y`` and ``text``. It is not modified.
    eps : float, optional
        DBSCAN neighbourhood radius applied to ``y``. Default 3.
    min_samples : int, optional
        DBSCAN ``min_samples``. Default 2.

    Returns
    -------
    pandas.DataFrame
        New frame with columns ``index``, ``x``, ``y``, ``text``, ``cluster``
        and ``min_y``, sorted by ``y`` descending then ``x`` ascending.
        ``index`` holds each row's position before sorting.
    """
    if df.empty:
        # A page without text: DBSCAN rejects zero samples, so return an empty
        # frame that has the same columns as the normal result.
        return df.assign(
            cluster=pd.Series(index=df.index, dtype='int64'),
            min_y=df['y'],
        ).reset_index()

    # Imported here (as the original cell did) so scikit-learn is only
    # required when there is something to cluster.
    from sklearn.cluster import DBSCAN

    # cluster the y coordinates
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit(
        df['y'].values.reshape(-1, 1)).labels_
    # assign() builds a new frame, leaving the caller's DataFrame untouched
    rows = df.assign(cluster=labels)

    # now sort
    cluster_mins = rows.groupby('cluster')['y'].min().rename('min_y').reset_index()
    rows = rows.merge(cluster_mins, on='cluster')
    clustered = rows['cluster'] != -1
    rows.loc[clustered, 'y'] = rows.loc[clustered, 'min_y']
    rows = rows.sort_values(['y', 'x'], ascending=[False, True])

    return rows.reset_index()


In [14]:
def get_text(filename, startpage=None, endpage=None):
    """Render the requested pages of a PDF as a single plain-text string.

    Pages are obtained from ``parsepdf(filename, startpage, endpage)``.
    Within a page, fragments sharing a ``y`` value are joined with ``'||'``
    into one row, rows are emitted from the highest ``y`` to the lowest and
    separated by a newline; pages are separated by three newlines.
    """
    def render_page(frame):
        # groupby sorts the y keys in ascending order; reversing the result
        # puts the highest y first
        row_texts = frame.groupby('y')['text'].agg(lambda cells: '||'.join(cells))
        return '\n'.join(row_texts[::-1])

    page_frames = parsepdf(filename, startpage, endpage)
    return '\n\n\n'.join(render_page(frame) for frame in page_frames)


In [17]:
# NOTE(review): this shell escape asks the shell to *execute* the PDF rather than
# inspect it, and the output recorded for this cell is a directory path, so the
# cell presumably ran a different command originally (e.g. `!pwd`).
# TODO confirm the intent, or remove this cell.
!/home/jovyan/Old_Docs/test1.pdf

/home/jovyan/Notebooks


In [18]:
# Smoke test: extract the text of a sample PDF as one string (no page range given).
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
get_text('/home/jovyan/Old_Docs/test1.pdf')

Number of pages:  3


'Headline Verdana Bold\nAssurance & Advisory\nUnit Pricing | FS CoE Tie Out process\nSeptember 2018\n\n\nUnit Pricing for FS tie out only\nINTERNAL USE ONLY\nResponsibilities\nDefinitions\nFinancial statements (FS) CoE responsibilities:\nFinancial statements are defined as including:||1. FS Tie Out scope of work includes:\n•||Statement of Financial Position||1.1 Checking mathematical accuracy, consistency of information\n•||Statement of Profit and Loss and Other Comprehensive Income||and accuracy of cross referencing.\n•||Statement of Changes in Equity||1.2 Reconciling the figures disclosed in the FS to\n•||Statement of Cash Flows||the audited Trial Balance (TB).\n•||Summary of significant accounting policies and other notes||1.3 Checking the spelling, formatting and grammar checks.\n•||Pro-forma Directors’ report and Directors’ Declaration||1.4 Agreeing the comparatives to the signed prior year FS.\n2. FS Tie Out scope of work does not include:\nPricing||2.1 Any preparation of the FS 