# Electric Defense Weapon Report (EDWR) Scraper

## Import Libraries

In [14]:
import os
import pandas as pd
import numpy as np

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdftypes import resolve1

## Define Constants

In [2]:
# directory containing the report PDFs, given as a relative path;
# note that the trailing slash is part of the value
base_path = "test_pdfs/"

In [3]:
# Path of the single sample report parsed below. The components are passed to
# os.path.join separately so it inserts the separator itself (the previous
# string concatenation produced a doubled slash: "test_pdfs//EDWOPM1.pdf").
my_file = os.path.join(base_path, "EDWOPM1.pdf")

# password handed to PDFDocument when the file is opened; empty string here
password = ""

In [52]:
# lookup table: stripped PostScript literal name -> integer boolean (0 / 1)
psLiteralDict = dict(Off=0, Yes=1)

In [80]:
# Names of the files sitting directly inside base_path. Only the first entry
# produced by os.walk is consumed, so sub-directories are not descended into.
walker = os.walk(base_path)
first_level = next(walker, None)
files = [] if first_level is None else list(first_level[2])

print(files)

['EDWOPM5.pdf', 'EDWOPM4.pdf', '.DS_Store', 'EDWOPM1.pdf', 'EDWOPM3.pdf', 'EDWOPM2.pdf']


## Prepare PDF

In [82]:
# Open the report in binary mode. The handle is not closed in this cell.
# NOTE(review): later cells resolve objects from `document`; pdfminer appears
# to read them lazily through the parser, so confirm before closing fp early.
fp = open(my_file, "rb")

# Feed the stream to a parser, then build the document object from it
# (the password is forwarded as-is and may be empty).
parser = PDFParser(fp)
document = PDFDocument(parser, password)

# Abort right away when the document does not allow extraction.
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed

print('Document is well.')

Document is well.


In [83]:
# sanity check: show the class of the parsed document object
type(document)

pdfminer.pdfdocument.PDFDocument

## Resolve PDF Document

In [81]:
# Pull the form-field array out of the document catalog. Both the AcroForm
# dictionary and its 'Fields' entry go through resolve1, so the lookup also
# works when 'Fields' is stored as an indirect reference instead of an
# inline array.
fields = resolve1(resolve1(document.catalog['AcroForm'])['Fields'])

# Each entry of `fields` is itself resolved with resolve1 before use; the
# helper functions defined below read its 'T' (name) and 'V' (value) keys.

type(fields)

list

## Construct Empty Dataframe with Inherited Column Names

In [17]:
def getColumnNames(fields):
    """
    Given a resolved list of fields from PDF document, create an empty dataframe with field column names.

    The column names are also stored in the global ``field_list`` so that code
    relying on that side effect keeps working.

    :param fields: <list> The original fields list for a given PDF
    :return: <DataFrame> An empty dataframe whose columns are the field names, in field order
    """

    global field_list

    # each entry is resolved to its dictionary; 'T' holds the field name as a
    # byte string, which is decoded to text
    field_list = [resolve1(ref).get('T').decode("utf-8") for ref in fields]

    # previously the dataframe was built and then discarded; hand it back
    return pd.DataFrame(columns=field_list)

In [75]:
def getValuesFromFields(fields):
    """
    Given a resolved list of fields from PDF document, create a list of the field values.

    Byte-string values are decoded as UTF-8. Every other value (presumably a
    PostScript literal such as /Off or /Yes -- verify against the PDFs) is
    converted with ``str()``, stripped of surrounding slashes and single
    quotes, and looked up in ``psLiteralDict``; names that are not in that
    table, including a missing value, become ``None``.

    The list is also stored in the global ``values`` so that code relying on
    that side effect keeps working.

    :param fields: <list> The original fields list for a given PDF
    :return: <list> One entry per field: a string, an integer boolean, or None
    """

    global values
    values = []

    for i in fields:

        field_value = resolve1(i).get('V')

        try:
            field_value = field_value.decode("utf-8")
        except (AttributeError, UnicodeDecodeError):
            # Not a decodable byte string. Only these two errors are expected
            # here; a bare except would also swallow KeyboardInterrupt.
            # Strip leading/trailing backslashes and forward slashes, then
            # leading/trailing single quotes, to get the bare literal name.
            literal_name = str(field_value).strip("\\/").strip("'")

            # convert the name to an integer boolean (None when unmapped)
            field_value = psLiteralDict.get(literal_name)

        values.append(field_value)

    return values

In [71]:
def addRow(df,ls):
    """
    Given a dataframe and a list, append the list as a new row to the dataframe.

    The input dataframe is left unmodified; a new dataframe is returned. The
    row is built directly from the Python values rather than through a NumPy
    array, so a mixed list of strings and integers keeps its integers instead
    of being coerced to all strings. ``pd.concat`` is used because
    ``DataFrame.append`` no longer exists in pandas 2.x.

    :param df: <DataFrame> The original dataframe
    :param ls: <list> The new row to be added
    :return: <DataFrame> The dataframe with the newly appended row
    :raises ValueError: If the number of values differs from the number of columns

    https://stackoverflow.com/questions/26309962/appending-a-list-or-series-to-a-pandas-dataframe-as-a-row
    """

    row_values = list(ls)
    column_names = list(df.columns)

    if len(row_values) != len(column_names):
        raise ValueError(
            "row has {0} values but the dataframe has {1} columns".format(
                len(row_values), len(column_names)
            )
        )

    newRow = pd.DataFrame([row_values], columns=column_names)

    if len(df) == 0:
        # Nothing to concatenate with; returning the one-row frame directly
        # also avoids concatenating an empty frame.
        return newRow

    return pd.concat([df, newRow], ignore_index=True)

In [72]:
# fills the global `values` list as a side effect of the call
getValuesFromFields(fields)

In [73]:
# show the extracted values (one entry per form field, in field order)
print(values)

['New Haven', 'Buck', '2018', '12/20', '2283787108782', '12/20', '2/01/2018', '3p', '5\'3"', '180', '25', 'M', 'White', 1, 0, 0, 0, 1, 1, 0, 1, '3s', '10s', '', 'Chris', 0, 0, 0, 1, 0, 0, 0, 0, 0, '', 1, 0, 0, 0, 0, 0, 0, '', 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, '', 0, 0, 0, 1, 1, 0, 0, 0, '', 0, 0, 0, 0, 1, 0, 'knife', '', 0, 1, 0, 0, 0, 0, 0, 0, 0, 'spoon', 1, 0, 0, '']


In [74]:
# Build the empty, column-labelled frame explicitly so this cell also runs on
# a fresh kernel: `df` is not assigned anywhere else at notebook scope.
# getColumnNames() stores the column names in the global `field_list`.
getColumnNames(fields)
df = pd.DataFrame(columns=field_list)

addRow(df, values)

Unnamed: 0,Law Enforcement Agency,Report Prepared By,Reporting Year,Date of Report,Incident Case Number,Date of Report_2,Date of Incident,Time of Incident,Height,Weight,...,Fleeing,Unarmed Assault,Armed with Firearm,Armed with Edged Weapon,Armed with Blunt Instrument,Armed with Other,Armed with Other 2,Failed to Follow Officer Directions,Suicidal,Subject Resistance Other
0,New Haven,Buck,2018,12/20,2283787108782,12/20,2/01/2018,3p,"5'3""",180,...,0,0,0,0,0,spoon,1,0,0,
