In [1]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import re
import pandas as pd

# 1. Defining a function to convert PDF to text

In [2]:
def convert_pdf_to_txt(path, pages = None):
    '''
    Extract the text of a PDF file and return it as one string.

    source: http://stackoverflow.com/a/26495057

    Parameters
    ----------
    path : str
        Location of the PDF file; it is opened in binary mode.
    pages : iterable of int, optional
        Page numbers to process, forwarded to ``PDFPage.get_pages`` as a set.
        ``None`` (or any falsy value) forwards an empty set instead.
        NOTE(review): pdfminer appears to treat these as zero-based page
        indices and an empty set as "all pages" -- confirm against the
        installed pdfminer version.

    Returns
    -------
    str
        Everything the ``TextConverter`` wrote while the pages were processed.
    '''
    pagenos = set(pages) if pages else set()

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # The `with` block guarantees the file handle is released even when
        # the PDF is malformed and pdfminer raises part-way through.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp,
                                          pagenos,
                                          maxpages=0,
                                          password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        # Read the buffer before the `finally` clause closes it.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()

# 2. Defining path to PDF, pages of interest, extracting raw .txt

In [3]:
# Input document and the page numbers handed to the extractor.
PDFpath = "./108849.pdf"
pageofint = [1, 41]

# Raw text of the selected pages; the regex cells further down all parse this string.
rawtxt = convert_pdf_to_txt(PDFpath, pages=pageofint)

As an example, we are now going to extract several pieces of information:

1) The Legal name

2) The Advisory business name 

3) The list of Direct Owners and Executive Officers (Schedule A) 

4) The list of Indirect Owners (Schedule B)

Extracting 1) and 2) is simple, because each is just a single line of text. 3) and 4) are more complicated, because they require extracting text from a table and then reassembling the extracted text into tabular form.

# 3. Legal and advisory business names

In [4]:
# Item 1.A: whatever sits between the "full legal name" prompt and the start of item B.
legal_name_re = re.compile(
    r'Your\sfull\slegal\sname\s\(if\syou\sare\sa\ssole\sproprietor,'
    r'\syour\slast,\sfirst,\sand\smiddle\snames\):(.*?)B\.Name\sunder',
    re.DOTALL,
)
legname = legal_name_re.search(rawtxt).group(1).strip()

# Item 1.B: whatever sits between the "primary business name" prompt and "List on Section".
business_name_re = re.compile(
    r'Name\sunder\swhich\syou\sprimarily\sconduct\syour\sadvisory\sbusiness,'
    r'\sif\sdifferent\sfrom\sItem\s1\.A\.:(.*?)List\son\sSection',
    re.DOTALL,
)
adbusname = business_name_re.search(rawtxt).group(1).strip()

In [5]:
# Show both extracted names, one per line.
print(legname, adbusname, sep="\n")

BOGLE INVESTMENT MANAGEMENT L P
BOGLE INVESTMENT MANAGEMENT, L.P.


# 4. Lists of Direct and Indirect Owners
## 4.1 Extracting the pieces of text containing the tables

In [6]:
# Schedule A body: everything after the table header (which ends in
# "or Employer ID No.") up to the "Schedule BIndirect" marker.
sched_a_re = re.compile(
    r'Complete\seach\scolumn\..*?FULL\sLEGAL.*?or\sEmployer\sID\sNo'
    r'\.(.*?)Schedule\sBIndirect',
    re.DOTALL,
)
schedAraw = sched_a_re.search(rawtxt).group(1).strip()

# Schedule B body: the greedy `.*` skips ahead to the last "BOGLE INV" from
# which the rest of the pattern can still match, and the capture runs up to
# the "Schedule D - Miscellaneous" marker.
# NOTE(review): "BOGLE INV" looks specific to this filing -- another PDF would
# need a different anchor.
sched_b_re = re.compile(
    r'or\sEmployer\sID\sNo\..*(BOGLE\sINV.+?PARTNER.*?)'
    r'Schedule\sD\s-\sMiscellaneous',
    re.DOTALL,
)
schedBraw = sched_b_re.search(rawtxt).group(1).strip()

## 4.2 Adding a line break after the last field of each table row, then splitting on line breaks

In [7]:
# Each table row ends with an ID of the form NN-NNNNNNN or NNNNNNN. Appending a
# newline after every such ID lets splitlines() cut the text into one string per row.
row_end_re = re.compile(r'([0-9]{2}-[0-9]{7}|[0-9]{7})')
schedA = row_end_re.sub(r'\1\n', schedAraw).splitlines()
schedB = row_end_re.sub(r'\1\n', schedBraw).splitlines()

## 4.3 Extracting each field of each line with a regex

In [8]:
# One anchored pattern per schedule; every row is matched exactly once and all
# of its fields are read from that single match.
#
# Matching the whole row at once matters for correctness: a standalone search
# such as r'(.+)(NA|A|...)([YN])([YN])(ID)' has a greedy prefix that swallows
# the leading "N" of an "NA" ownership code and reports it as "A". Pinning the
# code directly behind the MM/YYYY date removes that ambiguity.
#
# Schedule A groups: name, DE/FE/I, status (2 parts), date, ownership code,
#                    control person, PR, ID.
_SCHED_A_ROW = re.compile(
    r'(.+)(DE|FE|I)(LIMITED|GENERAL)([A-Z\s]+)([0-9]{2}/[0-9]{4})'
    r'(NA|A|B|C|D|E)([YN])([YN])([0-9]{2}-[0-9]{7}|[0-9]{7})')
# Schedule B groups: name, DE/FE/I, entity, status (2 parts), date,
#                    ownership code, control person, PR, ID.
_SCHED_B_ROW = re.compile(
    r'(.+)(DE|FE|I)(LLC|NA)(SOLE|GENERAL)([A-Z\s]+)([0-9]{2}/[0-9]{4})'
    r'(NA|A|B|C|D|E|F)([YN])([YN])([0-9]{2}-[0-9]{7}|[0-9]{7})')


def _parse_row(row_pattern, entry, schedule):
    """Return the captured fields of one table row.

    Raises ValueError (quoting the offending row) when the row does not fit
    the expected layout, instead of failing on ``None.group``.
    """
    match = row_pattern.search(entry)
    if match is None:
        raise ValueError("Schedule %s row does not match the expected layout: %r"
                         % (schedule, entry))
    return match.groups()


sAtl = []
sBtl = []

for entry in schedA:
    if not entry.strip():
        # Nothing but whitespace between two row terminators: no data to parse.
        continue
    (fln, dfei, status_head, status_tail, date_saq,
     own_code, cont_pers, pr, crd_no) = _parse_row(_SCHED_A_ROW, entry, "A")
    status = status_head + " " + status_tail
    tup = (fln, dfei, status, date_saq, own_code, cont_pers, pr, crd_no)
    sAtl.append(tup)

for entry in schedB:
    if not entry.strip():
        continue
    (fln, dfei, ent_int, status_head, status_tail, date_saq,
     own_code, cont_pers, pr, crd_no) = _parse_row(_SCHED_B_ROW, entry, "B")
    status = status_head + " " + status_tail
    tup = (fln, dfei, ent_int, status, date_saq, own_code, cont_pers, pr, crd_no)
    sBtl.append(tup)


## 4.4 Dumping the result into pandas dataframes

In [9]:
# Column headers, in the same order as the fields of each parsed row tuple.
schedAnames = [
    'FULL LEGAL NAME',
    'DE/FE/I',
    'Status',
    'Date Status Acquired',
    'Ownership Code',
    'Control Person',
    'PR',
    'CRD No.',
]
schedBnames = [
    'FULL LEGAL NAME',
    'DE/FE/I',
    'Entity in Which Interest is Owned',
    'Status',
    'Date Status Acquired',
    'Ownership Code',
    'Control Person',
    'PR',
    'CRD No.',
]

# One frame per schedule, one row per parsed tuple.
sAdf = pd.DataFrame(sAtl, columns=schedAnames)
sBdf = pd.DataFrame(sBtl, columns=schedBnames)

In [10]:
# Plain-text dump of both schedules, Schedule A first.
print(sAdf, sBdf, sep="\n")

                FULL LEGAL NAME DE/FE/I                  Status  \
0           HARTT, KEITH, DAVID       I         LIMITED PARTNER   
1      BOGLE JR., JOHN, CLIFTON       I         LIMITED PARTNER   
2  BARDINELLI, BRITT, STEPHANIE       I  LIMITED PARTNER ANDCCO   
3    BOGLE INVESTMENT GROUP LLC      DE         GENERAL PARTNER   
4       SABBEY, CHRISTOPHER, N.       I         LIMITED PARTNER   
5           LEWIS, JONATHON, D.       I         LIMITED PARTNER   
6              HUMMEL, PAUL, R.       I         LIMITED PARTNER   

  Date Status Acquired Ownership Code Control Person PR     CRD No.  
0              05/1999              B              Y  N     4373231  
1              05/1999              C              Y  N     2662428  
2              09/1999              A              Y  N     2839637  
3              05/1999              A              Y  N  04-3479131  
4              06/2001              B              N  N     5648650  
5              08/2002              B      