## Research Project 1
---
```text
- Source: PCAOB
- Goal: Information Extraction from PDF files
- Techniques: Regular Expressions, Probabilistic Context-Free Grammar
- Tools: pdftotext, re, duckling
- Lines of code: ~100
```

In [None]:
# Standard library
import os
import re
import json
from collections import Counter
from pprint import PrettyPrinter

# Third-party
import requests
from dateutil import parser
from duckling import DucklingWrapper

# Shared pretty-printer used to display each parsed report in the driver loop.
PPRINTER = PrettyPrinter()
# Single Duckling wrapper instance reused by get_dates and
# get_period_of_inspection for time-expression parsing.
DUCK_PARSER = DucklingWrapper()

def get_text_from_pdf(filename):
    """Download a PCAOB inspection report and return its extracted text.

    The PDF is fetched from pcaobus.org, saved as ``./<filename>.pdf``,
    converted with the ``pdftotext`` command-line tool into
    ``./<filename>.txt``, and the resulting text file is read back.

    Args:
        filename: Report name without the ``.pdf`` extension, as used in the
            PCAOB document URL.

    Returns:
        The contents of the generated text file as a string.

    Raises:
        requests.HTTPError: If the download returns an HTTP error status.
        FileNotFoundError: If the ``pdftotext`` executable is not available.
        subprocess.CalledProcessError: If ``pdftotext`` exits with a
            non-zero status.
    """
    # Local import so this function stays self-contained.
    import subprocess

    url = 'https://pcaobus.org/Inspections/Reports/Documents/%s.pdf' % filename
    pdf_path = './%s.pdf' % filename
    txt_path = './%s.txt' % filename

    res = requests.get(url)
    # Fail early instead of saving an HTML error page as a ".pdf".
    res.raise_for_status()
    with open(pdf_path, 'wb') as f:
        f.write(res.content)

    # Argument list (no shell): names containing spaces or shell
    # metacharacters are passed through verbatim, and the "./" prefix keeps a
    # leading "-" from being read as an option. check=True prevents silently
    # reading a stale or missing .txt file after a failed conversion.
    # If format is important, use "pdftohtml".
    subprocess.run(['pdftotext', pdf_path, txt_path], check=True)

    with open(txt_path, 'r') as f:
        return f.read()

def get_field_from_regex(text, pattern, flags=0, most_common=True):
    """Extract a field from ``text`` using a regular expression.

    Every match of ``pattern`` is whitespace-normalised: leading and trailing
    whitespace is removed and internal runs of whitespace (including
    newlines) collapse to a single space.

    Args:
        text: The string to search.
        pattern: Regular expression handed to ``re.findall``. It should
            contain at most one capturing group so that each match is a
            string.
        flags: Optional ``re`` flags.
        most_common: When true, return only the most frequent normalised
            match; when false, return the list of all normalised matches in
            order of appearance.

    Returns:
        A single string if ``most_common`` is true, otherwise a list of
        strings (possibly empty).

    Raises:
        IndexError: If ``most_common`` is true and the pattern does not
            match anywhere in ``text``.
    """
    cleaned = [' '.join(match.split())
               for match in re.findall(pattern, text, flags=flags)]
    if not most_common:
        return cleaned
    if not cleaned:
        # Same exception type as before, but with a message that names the
        # pattern instead of a bare "list index out of range".
        raise IndexError('no match found for pattern %r' % (pattern,))
    # most_common(1) avoids sorting every distinct match; ties still resolve
    # to the value encountered first.
    return Counter(cleaned).most_common(1)[0][0]

def get_dates(text):
    """Return the day-grain dates Duckling finds in ``text``, sorted ascending.

    Only parse results whose value carries a ``grain`` of ``'day'`` are kept.
    Each one is converted with ``dateutil`` and its tzinfo is dropped.
    """
    day_values = []
    for entity in DUCK_PARSER.parse_time(text):
        value = entity['value']
        if 'grain' not in value or value['grain'] != 'day':
            continue
        day_values.append(value['value'])

    dates = [parser.parse(raw).replace(tzinfo=None) for raw in day_values]
    dates.sort()
    return dates

def get_period_of_inspection(text):
    """Return the dates describing the inspection period of a report.

    The paragraph following the "INSPECTION PROCEDURES AND CERTAIN
    OBSERVATIONS" heading is run through Duckling. If an interval result
    (a dict value containing a ``'to'`` key) is found, the first one is
    returned as ``[from, to]`` with tzinfo removed. Otherwise the result of
    ``get_dates`` on that paragraph is returned.
    """
    section = get_field_from_regex(
        text,
        r'INSPECTION PROCEDURES AND CERTAIN OBSERVATIONS(.*?)\n\n',
        flags=re.DOTALL)

    intervals = []
    for entity in DUCK_PARSER.parse_time(section):
        value = entity['value']['value']
        if isinstance(value, dict) and 'to' in value:
            intervals.append(entity)

    if intervals:
        span = intervals[0]['value']['value']
        start = parser.parse(span['from']).replace(tzinfo=None)
        end = parser.parse(span['to']).replace(tzinfo=None)
        return [start, end]

    # No explicit interval recognised: fall back to individual dates.
    return get_dates(section)

def parse_text(text):
    """Extract the structured fields of an inspection report from its text.

    Returns a dict with the release number, firm name, offices, ownership
    structure, report date (first date on the first form-feed-delimited
    page), inspection period and the list of "failure" items.
    """
    report = {}

    report['PCAOB Release No'] = get_field_from_regex(
        text=text,
        pattern=r'PCAOB Release No. (\d{3}-\d{4}-\d{3})',
        flags=re.IGNORECASE)

    report['Firm'] = get_field_from_regex(
        text=text,
        pattern=r'Inspection of (.*?)\n')

    report['Offices'] = get_field_from_regex(
        text=text,
        pattern=r'Number of offices(.*?)Ownership',
        flags=re.DOTALL)

    report['Ownership structure'] = get_field_from_regex(
        text=text,
        pattern=r'Ownership structure(.*?)Number of partners',
        flags=re.DOTALL)

    # Pages are separated by form feeds; use the earliest date on page one.
    first_page = text.split('\x0c')[0]
    report['Date of Inspection Report'] = get_dates(text=first_page)[0]

    report['Inspection Period'] = get_period_of_inspection(text=text)

    # Keep every numbered "(n) ... failure ..." item, not just the most common.
    report['Failures'] = get_field_from_regex(
        text=text,
        pattern=r'\n\(\d\)((?:.*?)failure(?:.*?))(?:\n\n|;)',
        flags=re.DOTALL,
        most_common=False)

    return report

# Report names (without the ".pdf" extension) to download and parse.
pdfs = ['2005_Tamas_B._Revai_CPA', '2015_Bravos_Associates']

# Fetch, parse and pretty-print each report, separated by a blank line.
for pdf in pdfs:
    print()
    text = get_text_from_pdf(pdf)
    parsed = parse_text(text)
    PPRINTER.pprint(parsed)