# PDF to DataFrame
Read every password-protected PDF paystub in a directory and parse the data into a pandas DataFrame for later use.

In [None]:
import re
from datetime import datetime
from getpass import getpass
from io import StringIO

import pandas as pd
from pypdf import PdfReader
# Prompt with getpass rather than print()/input() so the PDF password is not
# echoed back while it is being typed.
password = getpass("Enter the PDF password: ")

In [None]:
# Open one hard-coded sample paystub, decrypting it with the password entered
# earlier, and keep its first page for the text-extraction step further down.
filename = "example4.pdf"
reader = PdfReader(filename, password=password)
page = reader.pages[0]

# Parsed results for the page being read. visitor_body adds the scalar
# metadata fields as it finds them and appends table rows to data["fwf"].
data = dict()
data["fwf"] = []

# A "." with no digit on either side (filler dots inside table rows). Decimal
# points in amounts such as 40.00 are deliberately left untouched.
_FILLER_DOT = re.compile(r"(?<!\d)\.(?!\d)")


def visitor_body(text, cm, tm, fontDict, fontSize):
    """Collect paystub fields from one text fragment into the module-level ``data``.

    Intended to be passed as ``visitor_text`` to pypdf's ``extract_text``, which
    calls it once per text fragment on the page.

    Fragments in an Arial font are inspected for metadata:
      * text starting with ``"Period :"`` sets ``period_start`` / ``period_end``;
      * the fragment at the hard-coded name position sets ``name``;
      * the fragment at the hard-coded payment position sets ``payment_date``
        and ``payment_id``.
    Fragments in the ``CourierNew,Bold`` font are treated as table rows: filler
    dots are replaced by spaces and the row is appended to ``data["fwf"]``.
    Fragments in any other font, or with no font information, are ignored.

    Args:
        text: The text fragment. Empty fragments are ignored.
        cm: Current transformation matrix (unused).
        tm: Text matrix; elements 4 and 5 are compared against the hard-coded
            positions of the name and payment fields.
        fontDict: Font dictionary for the fragment. May be ``None`` or lack
            ``"/BaseFont"``, in which case the fragment is ignored.
        fontSize: Font size (unused).

    Raises:
        ValueError: If a date in a recognised fragment does not match the
            ``"%b %d, %Y"`` format.
        IndexError: If a recognised fragment lacks its expected separator.
    """
    if not text:
        return

    # pypdf passes None for unknown fonts, and a font dictionary is not
    # guaranteed to carry /BaseFont; treat both as "no recognisable font".
    base_font = (fontDict or {}).get("/BaseFont", "")

    # Extract metadata from the PDF
    if "Arial" in base_font:
        # Period start and end: drop the "Period :" label, split at " to ",
        # and parse both halves as dates.
        if text.startswith("Period :"):
            period = text.removeprefix("Period :").strip().split(" to ")
            # The end date arrives fused with the page heading; remove the
            # heading and any whitespace left between it and the date.
            period_end = period[1].removesuffix("Direct Deposit Payment Advice").strip()
            data["period_start"] = datetime.strptime(period[0].strip(), "%b %d, %Y").date()
            data["period_end"] = datetime.strptime(period_end, "%b %d, %Y").date()

        # Name the paystub belongs to. The position is rounded before comparing
        # (as the payment check below does) because matrix entries are floats
        # and exact equality would miss values that differ only by rounding noise.
        if round(tm[4], 1) == 88.7 and round(tm[5]) == -72:
            data["name"] = text

        # Payment date and id, separated by the " Payment Date : " label.
        if 514 <= round(tm[4]) <= 515 and round(tm[5]) == -281:
            tokens = text.strip().split(" Payment Date : ")
            data["payment_date"] = datetime.strptime(tokens[0], "%b %d, %Y").date()
            data["payment_id"] = tokens[1]

    # Extract hours and monetary data, blanking the filler dots inside the rows
    # so column positions are preserved for fixed-width parsing.
    elif "CourierNew,Bold" in base_font:
        data["fwf"].append(_FILLER_DOT.sub(" ", text))

        # Echo the raw row so the page layout can be inspected while parsing.
        print(text, end="")
    
def get_table_in_fwf(data, first_row_index, left_anchor, right_anchor, left_offset=0, skip_rows=(), **kargs):
    """Extract a rectangular block of text, ostensibly a table, from a list of rows.

    The column range is found on the first row: it starts where ``left_anchor``
    begins and ends with the last character of ``right_anchor``. The same
    column range is then cut out of each selected row.

    Args:
        data (list of str): The rows from which to extract the block.
        first_row_index (int): Index of the first row to use. This row is the
            one searched for the anchors.
        left_anchor (str): The first occurrence of this string at or after
            ``left_offset`` marks the left boundary.
        right_anchor (str): The first occurrence of this string at or after the
            left boundary marks the right boundary (its last character is
            included).
        left_offset (int): Column at which the search of the first row starts.
        skip_rows (iterable of int): Positions within the extracted block
            (0 = the first row) to drop from the result.
        **kargs: Exactly one of:
            num_rows (int): Total number of rows to extract, counted from
                ``first_row_index`` and before ``skip_rows`` is applied.
            bottom_anchor (str): Extract rows down to and including the first
                row below the first row whose column range contains this string.

    Returns:
        tuple: ``(text, right_index)`` where ``text`` is the kept rows joined
        with newlines and ``right_index`` is the column just past the right
        boundary, usable as ``left_offset`` for a table further to the right.

    Raises:
        ValueError: If not exactly one of ``num_rows`` / ``bottom_anchor`` is
            given (or the one given is None), if an anchor is not found on the
            first row, or if ``bottom_anchor`` is not found below it.
    """
    if len(kargs.keys() & {'num_rows', 'bottom_anchor'}) != 1:
        raise ValueError('One keyword argument is required: num_rows=int OR bottom_anchor=str')
    num_rows = kargs.get("num_rows")
    bottom_anchor = kargs.get("bottom_anchor")
    if num_rows is None and bottom_anchor is None:
        # The one keyword supplied was explicitly None; there is nothing to go on.
        raise ValueError('One keyword argument is required: num_rows=int OR bottom_anchor=str')

    first_row = data[first_row_index]
    l_index = first_row.index(left_anchor, left_offset)
    # Search from the left boundary so an earlier occurrence of right_anchor
    # cannot yield a right boundary that lies left of the left one.
    r_index = first_row.index(right_anchor, l_index) + len(right_anchor)

    if num_rows is None:
        # Count rows up to and including the first one, below the first row,
        # whose slice contains bottom_anchor.
        for count, row in enumerate(data[first_row_index + 1:], start=2):
            if bottom_anchor in row[l_index:r_index]:
                num_rows = count
                break
        else:
            raise ValueError(f'bottom_anchor {bottom_anchor!r} not found below row {first_row_index}')

    table = [row[l_index:r_index] for row in data[first_row_index:first_row_index + num_rows]]
    skipped = set(skip_rows)
    items = [item for i, item in enumerate(table) if i not in skipped]
    return ("\n".join(items), r_index)
    
# Walk the page text; visitor_body fills `data` as a side effect.
page.extract_text(visitor_text=visitor_body)

# Show the metadata that was picked up.
print(data["name"])
print(data["payment_id"])
print(f'Date: {data["payment_date"]}')
print(f'From: {data["period_start"]}')
print(f'To:   {data["period_end"]}')

# data["fwf"] holds five tables, each of which is cut out separately and
# parsed as fixed-width text.
fwf_rows = data["fwf"]

# Table 1 - revenue. Found entirely from row 0: it runs from the first
# "Paycode" to the end of the first "YearToDate".
t1a, r_bound = get_table_in_fwf(
    fwf_rows, 0, "Paycode", "YearToDate", skip_rows=[1, 11], num_rows=12
)
t1 = pd.read_fwf(StringIO(t1a))
print(t1)

# Table 2 - deductions. Same anchors on row 0, searched to the right of
# where table 1 ended.
t2a, r_bound = get_table_in_fwf(
    fwf_rows, 0, "Paycode", "YearToDate", left_offset=r_bound, skip_rows=[1, 2], num_rows=6
)
t2 = pd.read_fwf(StringIO(t2a))
print(t2)

# Table 3 - time off.
t3a, r_bound = get_table_in_fwf(
    fwf_rows, 12, "Accrual", "Balance", skip_rows=[1], num_rows=6
)
t3 = pd.read_fwf(StringIO(t3a))
print(t3)

# Table 4 - government.
t4a, r_bound = get_table_in_fwf(
    fwf_rows, 18, "Government", "YearToDate", skip_rows=[1, 6], num_rows=8
)
t4 = pd.read_fwf(StringIO(t4a))
print(t4)

# Table 5 - totals. Shares row 18 with table 4 and is searched to the right
# of where table 4 ended.
t5a, r_bound = get_table_in_fwf(
    fwf_rows, 18, "Cheque", "YearToDate", left_offset=r_bound, skip_rows=[1, 6], num_rows=8
)
t5 = pd.read_fwf(StringIO(t5a))
print(t5)

# Next step: collapse the five DataFrames into a single row. That completes
# the per-PDF function; each such row then joins one large DataFrame holding
# the data for all time.