# PDF to DataFrame
Read every password-protected PDF paystub in a directory and parse its data into a pandas DataFrame for later use.

In [None]:
from pypdf import PdfReader
from datetime import datetime, date, timedelta
from io import StringIO
import re
import pandas as pd
# Ask for the PDF password without echoing it: a plain input() shows the
# typed password on screen (and leaves it in a notebook's saved cell output).
from getpass import getpass
password = getpass("Enter the PDF password: ")

In [None]:
# Open the password-protected paystub and keep a handle on its first page.
filename = "example4.pdf"
reader = PdfReader(filename, password=password)
page = reader.pages[0]

# Accumulator filled in by visitor_body: metadata fields are added as keys,
# and the fixed-width table rows are collected in order under "fwf".
data = {"fwf": []}
def visitor_body(text, cm, tm, fontDict, fontSize):
    """Collect paystub metadata and table rows from one extracted text fragment.

    Written to be passed as the ``visitor_text`` callback of
    ``page.extract_text``. Results are stored in the module-level ``data``
    dict:

    * ``period_start`` / ``period_end`` (``date``) from the "Period :" line
    * ``name`` from the fragment positioned at the name coordinates
    * ``payment_date`` (``date``) and ``payment_id`` from the payment line
    * ``fwf`` (list of str): every "CourierNew,Bold" fragment, with the
      filler periods replaced by spaces so it can be read as fixed-width text

    Args:
        text (str): The extracted text fragment.
        cm: Current transformation matrix (unused).
        tm: Text matrix; ``tm[4]`` and ``tm[5]`` are used as the x/y position.
        fontDict: Font dictionary of the fragment; may be ``None`` or lack a
            "/BaseFont" entry, in which case the fragment is ignored.
        fontSize: Font size (unused).

    Raises:
        ValueError: If a date in a recognised metadata line does not match
            the "%b %d, %Y" format.
    """
    if not text:
        return

    # The font dictionary is not guaranteed to be present (pypdf passes None
    # for unknown fonts), so fall back to an empty name that matches nothing.
    base_font = fontDict.get("/BaseFont", "") if fontDict else ""

    # Extract metadata from the PDF
    if "Arial" in base_font:
        # Period line: drop the 'Period :' prefix, split at ' to ' and parse
        # both halves as dates.
        if text.startswith("Period :"):
            period = text.removeprefix("Period :").strip().split(" to ")
            data["period_start"] = datetime.strptime(period[0], "%b %d, %Y").date()
            # The end date can be followed by the page title; strip again
            # afterwards because removing the title may leave whitespace
            # that strptime would reject.
            period_end = period[1].removesuffix("Direct Deposit Payment Advice").strip()
            data["period_end"] = datetime.strptime(period_end, "%b %d, %Y").date()

        # Name the paystub belongs to, identified purely by its position.
        # NOTE(review): tm[4] is compared for exact float equality while the
        # other coordinates are rounded - confirm this is stable across PDFs.
        if tm[4] == 88.7 and round(tm[5]) == -72:
            data["name"] = text

        # Payment date and id, again identified by position.
        if 514 <= round(tm[4]) <= 515 and round(tm[5]) == -281:
            tokens = text.strip().split(" Payment Date : ")
            data["payment_date"] = datetime.strptime(tokens[0], "%b %d, %Y").date()
            data["payment_id"] = tokens[1]

    # Extract hours and monetary data, blanking the filler periods in the rows.
    elif "CourierNew,Bold" in base_font:
        # A period with no digit on either side is filler, not a decimal point.
        pattern = r"(?<!\d)\.(?!\d)"
        data["fwf"].append(re.sub(pattern, " ", text))
    
def get_table_in_fwf(data, first_row_index, left_anchor, right_anchor, left_offset=0, skip_rows=()):
    """Extract a block of text, ostensibly a table, from fixed-width rows.

    The header row (``data[first_row_index]``) is searched for the two
    anchors to find the left and right column boundaries. Rows listed in
    ``skip_rows`` are dropped, then rows below the header are taken for as
    long as each one is wide enough to reach the right boundary and has a
    numeric character in its last column.

    Args:
        data (list of str): The rows from which to extract the block.
        first_row_index (int): Index of the header row; this row is searched
            for the anchors.
        left_anchor (str): The index of the first occurrence of this string
            is used as the left boundary.
        right_anchor (str): The index just past the last character of the
            first occurrence of this string is used as the right boundary.
        left_offset (int): If provided, the header row is only searched from
            this column onwards.
        skip_rows (iterable of int): Row offsets, relative to
            ``first_row_index``, to remove before scanning (e.g. the
            horizontal rule under the header).

    Returns:
        tuple: ``(text, r_index, next_row)`` where ``text`` is the table with
        rows joined by newlines, ``r_index`` is the column just past the
        table's right edge, and ``next_row`` is ``first_row_index`` plus the
        number of rows taken plus the number of skipped rows. The two indices
        can be used to reposition the next search.

    Raises:
        ValueError: If either anchor is not found in the header row.
        IndexError: If ``first_row_index`` is outside ``data``.
    """
    header = data[first_row_index][left_offset:]
    l_index = header.index(left_anchor) + left_offset
    r_index = header.index(right_anchor) + left_offset + len(right_anchor)

    # skip_rows is relative to the header row; convert to absolute indices
    skip_rows = [offset + first_row_index for offset in skip_rows]
    skipped = set(skip_rows)
    rows = [row for i, row in enumerate(data) if i not in skipped]

    # Row 0 of the table is the header, so counting starts at 1. Keep taking
    # rows until the data runs out or a row has no numeric character in the
    # table's last column.
    num_rows = 1
    while first_row_index + num_rows < len(rows):
        candidate = rows[first_row_index + num_rows]
        if len(candidate) < r_index or not candidate[r_index - 1].isnumeric():
            break
        num_rows += 1

    table = [row[l_index:r_index] for row in rows[first_row_index:first_row_index + num_rows]]
    return ("\n".join(table), r_index, first_row_index + num_rows + len(skip_rows))
    
# Walk the page; visitor_body fills `data` as a side effect of extraction.
page.extract_text(visitor_text=visitor_body)
print(data["name"])
print(data["payment_id"])
print("Date:", data["payment_date"])
print("From:", data["period_start"])
print("To:  ", data["period_end"])

# data["fwf"] holds 5 tables that are pulled out one at a time.
fwf_rows = data["fwf"]

# Table 1 - revenue: spans from the first "Paycode" to the end of the first
# "YearToDate", both found in the very first row (index 0).
t1a, r_bound, b_bound_1 = get_table_in_fwf(fwf_rows, 0, "Paycode", "YearToDate", skip_rows=[1])
t1 = pd.read_fwf(StringIO(t1a))
print(t1)

# Table 2 - deductions: same header row, searched to the right of table 1.
t2a, r_bound, b_bound_2 = get_table_in_fwf(fwf_rows, 0, "Paycode", "YearToDate", r_bound, skip_rows=[1,2])
t2 = pd.read_fwf(StringIO(t2a))
print(t2)

# Table 3 - time off: starts at the row index returned for table 1.
t3a, r_bound, b_bound_3 = get_table_in_fwf(fwf_rows, b_bound_1, "Accrual", "Balance", skip_rows=[1])
t3 = pd.read_fwf(StringIO(t3a))
print(t3)

# Table 4 - government: starts at the row index returned for table 3.
t4a, r_bound, b_bound_4 = get_table_in_fwf(fwf_rows, b_bound_3, "Government", "YearToDate", skip_rows=[1,6])
t4 = pd.read_fwf(StringIO(t4a))
print(t4)

# Table 5 - totals: same header row as table 4, searched to its right.
t5a, r_bound, b_bound_5 = get_table_in_fwf(fwf_rows, b_bound_3, "Cheque", "YearToDate", r_bound, skip_rows=[1,6])
t5 = pd.read_fwf(StringIO(t5a))
print(t5)

# Next step: generate a single row from the 5 pandas DataFrames.
# That completes the function to be applied to each PDF; each resulting
# row will join one large DataFrame holding all of the data for all time.

In [None]:
def pay_periods(year):
    """Yield the Saturdays that end a pay period in the given year.

    Pay periods are 14 days long and are counted from a fixed reference
    Saturday (2020-12-26). Only Saturdays that fall inside ``year`` and are a
    whole number of 14-day periods away from that reference are produced, in
    ascending order.

    Args:
        year (int): The calendar year to generate pay-period end dates for.

    Yields:
        datetime.date: Each pay-period end date within ``year``.
    """
    epoch = date(2020, 12, 26)

    # First Saturday on or after January 1st. weekday() is Monday == 0 ...
    # Saturday == 5; the modulo keeps the offset in 0..6 so a Sunday Jan 1
    # moves forward to Jan 7 instead of back into the previous year (which
    # could leave the loop below with nothing to yield).
    d = date(year, 1, 1)
    d += timedelta(days=(5 - d.weekday()) % 7)

    # Saturdays alternate between period ends and mid-period; if this one is
    # not aligned with the reference date, the next one is.
    if (d - epoch).days % 14 != 0:
        d += timedelta(days=7)

    while d.year == year:
        yield d
        d += timedelta(days=14)