Skip to content

Commit

Permalink
Merge pull request #94 from a0js/pdfreader-and-bamboohr-importer
Browse files Browse the repository at this point in the history
pdfreader and bamboohr paycheck importer
  • Loading branch information
redstreet committed Apr 19, 2024
2 parents fbba9a0 + ebbcfeb commit 230f755
Show file tree
Hide file tree
Showing 12 changed files with 397 additions and 6 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ coverage.xml
*.py,cover
.hypothesis/
.pytest_cache/
.debug-*

# Translations
*.mo
Expand Down
64 changes: 64 additions & 0 deletions beancount_reds_importers/importers/bamboohr/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
"""BambooHR paycheck importer"""

import re

from dateparser.search import search_dates

from beancount_reds_importers.libreader import pdfreader
from beancount_reds_importers.libtransactionbuilder import paycheck

# BambooHR exports paycheck stubs to pdf, with multiple tables across multiple pages.
# Call this importer with a config that looks like:
#
# bamboohr.Importer({"desc":"Paycheck (My Company)",
# "main_account":"Income:Employment",
# "paycheck_template": {}, # See beancount_reds_importers/libtransactionbuilder/paycheck.py for sample template
# "currency": "PENNIES",
# }),
#


class Importer(paycheck.Importer, pdfreader.Importer):
    """Paycheck importer for pay stubs that BambooHR exports as PDF.

    Combines ``pdfreader.Importer`` with ``paycheck.Importer``; this class
    only supplies the BambooHR-specific settings, the paycheck date lookup
    and the column-name normalization.
    """

    IMPORTER_NAME = "BambooHR Paycheck"

    def custom_init(self):
        """Set the reader options and header mappings for BambooHR stubs."""
        self.max_rounding_error = 0.04
        self.filename_pattern_def = r"PayStub.*\.pdf"
        # NOTE(review): these two settings are consumed by the pdf reader,
        # which is not visible here; the crop tuple presumably trims the
        # page before table extraction -- confirm the field order there.
        self.pdf_table_extraction_settings = {"join_tolerance": 4, "snap_tolerance": 4}
        self.pdf_table_extraction_crop = (0, 40, 0, 0)
        self.debug = False

        # Raw column headers that get a fixed normalized name; any other
        # header is normalized generically in prepare_tables().
        self.header_map = {
            "Deduction Type": "description",
            "Pay Type": "description",
            "Paycheck Total": "amount",
            "Tax Type": "description",
        }

        self.currency_fields = ["ytd_total", "amount"]

    def paycheck_date(self, input_file):
        """Return the paycheck date found in the PDF's metadata text.

        Reads ``input_file`` first if it has not been read yet, then takes
        the third date that ``search_dates`` finds in ``self.meta_text``.

        Raises:
            IndexError: if fewer than three dates are found.
        """
        if not self.file_read_done:
            self.read_file(input_file)
        # NOTE(review): dateparser documents that search_dates returns None
        # when nothing is found, which would raise TypeError here -- confirm
        # whether that case needs handling.
        dates = [date for _, date in search_dates(self.meta_text)]
        # The third date is used; presumably that is the pay date in
        # BambooHR's layout -- confirm against a real stub.
        return dates[2].date()

    def prepare_tables(self):
        """Rename every table's columns to normalized names and convert them.

        Each table in ``self.alltables`` is replaced by a version with
        renamed headers that has been passed through ``convert_columns``.
        """

        def valid_header(label):
            """Return the normalized column name for the raw header ``label``."""
            if label in self.header_map:
                # Look up by the function's own argument; the original read
                # the enclosing loop variable and only worked by coincidence.
                return self.header_map[label]

            # Generic fallback: snake_case the header and replace a 20xx
            # year with "ytd" (e.g. "2024 Total" -> "ytd_total").
            label = label.lower().replace(" ", "_")
            return re.sub(r"20\d{2}", "ytd", label)

        for section, table in self.alltables.items():
            # rename columns
            for header in table.header():
                table = table.rename(header, valid_header(header))
            # convert columns
            table = self.convert_columns(table)

            # Only existing keys are reassigned, so mutating the dict while
            # iterating over it is safe.
            self.alltables[section] = table

    def build_metadata(self, file, metatype=None, data=None):
        """Return the transaction metadata: the configured main account.

        ``file``, ``metatype`` and ``data`` are accepted but not used.
        ``data`` defaults to ``None`` instead of a shared mutable ``{}``.
        """
        return {"filing_account": self.config["main_account"]}
72 changes: 72 additions & 0 deletions beancount_reds_importers/importers/genericpdf/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""Generic pdf paycheck importer"""

import datetime

from beancount_reds_importers.libreader import pdfreader
from beancount_reds_importers.libtransactionbuilder import paycheck

# Generic pdf paystub importer. Use this to build your own pdf paystub importer.
# Call this importer with a config that looks like:
#
# genericpdf.Importer({"desc":"Paycheck (My Company)",
# "main_account":"Income:Employment",
# "paycheck_template": {}, # See beancount_reds_importers/libtransactionbuilder/paycheck.py for sample template
# "currency": "PENNIES",
# }),
#


class Importer(paycheck.Importer, pdfreader.Importer):
    """Generic PDF paystub importer, meant as a starting point for new ones.

    Combines ``pdfreader.Importer`` with ``paycheck.Importer``; this class
    supplies the extraction settings, the paycheck date lookup and the
    column-name normalization.
    """

    IMPORTER_NAME = "Generic PDF Paycheck"

    def custom_init(self):
        """Set the reader options and header mappings for the sample stub."""
        self.max_rounding_error = 0.04
        self.filename_pattern_def = r"paystub.*\.pdf"
        # NOTE(review): the three pdf_* settings are consumed by the pdf
        # reader, which is not visible here -- confirm their exact meaning.
        self.pdf_table_extraction_settings = {"join_tolerance": 4, "snap_tolerance": 4}
        self.pdf_table_extraction_crop = (0, 0, 0, 0)
        self.pdf_table_title_height = 0
        # Set this true as you play with the extraction settings and crop to view images of what the pdf parser detects
        self.debug = True

        # Raw column headers that get a fixed normalized name; any other
        # header is lowercased and snake_cased in prepare_tables().
        self.header_map = {
            "CURRENT": "amount",
            "CURRENT PAY": "amount",
            "PAY DESCRIPTION": "description",
            "DEDUCTIONS": "description",
            "TAX TYPE": "description",
            "TOTAL NET PAY": "description",
            "YTD": "ytd",
            "YTD PAY": "ytd",
        }

        self.currency_fields = ["ytd", "amount"]
        self.date_format = "%m/%d/%Y"

    def paycheck_date(self, input_file):
        """Return the paycheck date parsed from the header of ``table_1``.

        Reads ``input_file`` first if it has not been read yet. The last
        header cell of ``self.alltables["table_1"]`` is parsed with
        ``self.date_format``; the parsed datetime is also kept on
        ``self.date``.

        Raises:
            KeyError: if no table named ``table_1`` was extracted.
            ValueError: if the header cell does not match ``date_format``.
        """
        if not self.file_read_done:
            self.read_file(input_file)
        # Only the last header cell is of interest.
        *_, d = self.alltables["table_1"].header()
        self.date = datetime.datetime.strptime(d, self.date_format)
        return self.date.date()

    def prepare_tables(self):
        """Rename every table's columns to normalized names and convert them.

        Each table in ``self.alltables`` is replaced by a version with
        renamed headers that has been passed through ``convert_columns``.
        """

        def valid_header(label):
            """Return the normalized column name for the raw header ``label``."""
            if label in self.header_map:
                # Look up by the function's own argument; the original read
                # the enclosing loop variable and only worked by coincidence.
                return self.header_map[label]

            return label.lower().replace(" ", "_")

        for section, table in self.alltables.items():
            # rename columns
            for header in table.header():
                if section == "table_6" and header == "":
                    # table_6 has an unnamed column that is treated as the
                    # amount column.
                    table = table.rename(header, "amount")
                else:
                    table = table.rename(header, valid_header(header))
            # convert columns
            table = self.convert_columns(table)

            # Only existing keys are reassigned, so mutating the dict while
            # iterating over it is safe.
            self.alltables[section] = table

    def build_metadata(self, file, metatype=None, data=None):
        """Return the transaction metadata: the configured main account.

        ``file``, ``metatype`` and ``data`` are accepted but not used.
        ``data`` defaults to ``None`` instead of a shared mutable ``{}``.
        """
        return {"filing_account": self.config["main_account"]}
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from os import path

from beancount.ingest import regression_pytest as regtest

from beancount_reds_importers.importers import genericpdf


# Account mapping per extracted table: row description -> posting account.
_EARNINGS_ACCOUNTS = {
    "Bonus": "Income:Bonus:FakeCompany",
    "Overtime": "Income:Overtime:FakeCompany",
    "Regular": "Income:Salary:FakeCompany",
}
_TAX_ACCOUNTS = {
    "Federal MED/EE": "Expenses:Taxes:Medicare",
    "Federal OASDI/EE": "Expenses:Taxes:SocialSecurity",
    "Federal Withholding": "Expenses:Taxes:FederalIncome",
    "State Withholding": "Expenses:Taxes:StateIncome",
}
_NET_PAY_ACCOUNTS = {"CURRENT": "Assets:Checking:ABCBank"}

# Importer configuration used by the regression test below.
_IMPORTER_CONFIG = {
    "desc": "Paycheck",
    "main_account": "Income:Salary:FakeCompany",
    "paycheck_template": {
        "table_4": _EARNINGS_ACCOUNTS,
        "table_5": _TAX_ACCOUNTS,
        "table_6": _NET_PAY_ACCOUNTS,
    },
    "currency": "USD",
}


@regtest.with_importer(genericpdf.Importer(_IMPORTER_CONFIG))
@regtest.with_testdir(path.dirname(__file__))
class TestGenericPDF(regtest.ImporterTestBase):
    """Regression test for the generic PDF importer.

    The test directory is the one containing this module; the test cases
    themselves come from ``regtest.ImporterTestBase``.
    """
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@

2023-12-03 * "Paycheck"
filing_account: "Income:Salary:FakeCompany"
Assets:Checking:ABCBank 4228.00 USD
Expenses:Taxes:FederalIncome 416.00 USD
Expenses:Taxes:Medicare 128.00 USD
Expenses:Taxes:SocialSecurity 96.00 USD
Expenses:Taxes:StateIncome 32.00 USD
Income:Bonus:FakeCompany -3000.00 USD
Income:Overtime:FakeCompany -300.00 USD
Income:Salary:FakeCompany -1600.00 USD
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Income:Salary:FakeCompany
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
2023-12-03
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
paystub.sample.pdf

0 comments on commit 230f755

Please sign in to comment.