In [1]:
import pdftotext
import os
import re
import pandas as pd
import gspread

# East West Bank

In [2]:
# EDGE CASE: MULTI-PAGE STATEMENTS
# EDGE CASE: MULTI-LINE TRANSACTIONS
# EDGE CASE: CHECK DEPOSITS BREAK PYPDF

In [3]:
def pair_sequentially(numbers):
    # Check if the list has an even length
    if len(numbers) % 2 != 0:
        raise ValueError("The list must have an even length")

    # Create the list of pairs
    paired_list = []
    for i in range(0, len(numbers), 2):
        paired_list.append(numbers[i:i+2])
    
    return paired_list

def remove_indices(lst, indices):
    # Sort indices in descending order to avoid reindexing issues
    indices = sorted(indices, reverse=True)
    
    # Remove elements at each index
    for index in indices:
        if 0 <= index < len(lst):
            lst.pop(index)
        else:
            raise IndexError(f"Index {index} is out of bounds for list of length {len(lst)}")
    
    return lst

def has_consecutive_values(data):
  """
  This function checks if a list contains any consecutive values (adjacent duplicates).

  Args:
      data: A list of any data type.

  Returns:
      True if there are consecutive values, False otherwise.
  """
  if len(data) <= 1:
    return False  # Need at least 2 elements for consecutive values

  # Iterate through the list, checking for adjacent duplicates
  for i in range(1, len(data)):
    if data[i] == data[i-1] + 1:
      return True
  return False

def concatenate_multi_line_transactions(transaction_lines_list, bank, year, month):
    """
    Edit credit and debit line lists to concatenate strings from multi-line items.
    """
    transaction_lines_list = transaction_lines_list.copy()
    date_pattern = r"^\d{2}-\d{2}$"
    bad_idx_list = []
    for idx, txc_line in enumerate(transaction_lines_list):
        if not bool(re.match(date_pattern, txc_line[:5])):
            bad_idx_list.append(idx)

    if has_consecutive_values(data=bad_idx_list):
        raise Exception(f'Transactions with more than two lines found. Info: {bank}, {year}, {month}')

    if len(bad_idx_list) > 0:
        complete_bad_idx_list = []
        for i in bad_idx_list:
            complete_bad_idx_list.append(i - 1)
            complete_bad_idx_list.append(i)

        paired_short_line_list = pair_sequentially(complete_bad_idx_list)

        # extract them from the original list and concatenate them together
        combined_line_list = []
        for paired_lines in paired_short_line_list:
            combined_line = ''.join(transaction_lines_list[paired_lines[0]:paired_lines[1]+1])
            combined_line_list.append(combined_line)

        # drop the original items
        transaction_lines_list = remove_indices(transaction_lines_list, complete_bad_idx_list)

        return transaction_lines_list + combined_line_list
    else:
        return transaction_lines_list

def parse_lines_with_regex(lines, transaction_pattern):
    # Ref.: https://levelup.gitconnected.com/creating-a-bank-statement-parser-with-python-9223b895ebae
    transactions = []
    for line in lines:
        match = re.search(pattern=transaction_pattern, string=line)
        if match:
            transactions.append(match.groupdict())

    return pd.DataFrame(transactions)

transaction_pattern = (
        r"(?P<transaction_date>\d+-\d+)\s*"
        r"(?P<description>.*?)\s*"
        r"(?P<amount>[\d.,]+)$"
)

def parse_east_west_bank_bank_statements_by_year(bank, year, bank_statements_file_path):
    """
    Go through a year of monthly bank statements for a given bank and parse
    the statements and return a df.
    """
    file_path = os.path.join(bank_statements_file_path, bank, year)
    monthly_bank_statement_list = os.listdir(file_path)
    monthly_bank_statement_list = [i for i in monthly_bank_statement_list if i != '.DS_Store']
    transactions_df_list = []

    for monthly_bank_statement_file in monthly_bank_statement_list:
        month = monthly_bank_statement_file.split('_')[1].split('.')[0]
        with open(os.path.join(file_path, monthly_bank_statement_file), "rb") as file:
            pdf = pdftotext.PDF(file, physical=True)
            if len(pdf) == 2:
                first_page = pdf[0]
                lines1 = first_page.split("\n")
                lines1 = [i.lstrip() for i in lines1]
                lines1 = [i for i in lines1 if i != '']
                lines2 = None
            elif len(pdf) == 3:
                first_page = pdf[0]
                second_page = pdf[1]
                lines1 = first_page.split("\n")
                second_page = pdf[1]
                lines2 = second_page.split("\n")

                lines1 = [i.lstrip() for i in lines1]
                lines1 = [i for i in lines1 if i != '']
                lines2 = [i.lstrip() for i in lines2]
                lines2 = [i for i in lines2 if i != '']
            else:
                pdf_length = len(pdf)
                raise Exception(f'New length for pdf ({pdf_length}), time to set new rules. Info: {bank}, {year}, {month}')
            
            # for line in lines1:
            #     print(line.lstrip())

        try:
            credits_line_idx = lines1.index('CREDITS')
            credit_balance_exists = True
        except:
            credit_balance_exists = False
        try:
            debits_line_idx = lines1.index('DEBITS')
            debits_line_on_first_page = True
        except:
            debits_line_on_first_page = False
            raise Exception(f'DEBITS not on first page, time to set new rules. Info: {bank}, {year}, {month}')

        try:
            daily_balances_line_idx = lines1.index('DAILY BALANCES')
            daily_balances_on_first_page = True
        except:
            daily_balances_line_idx = lines2.index('DAILY BALANCES')
            daily_balances_on_first_page = False
        credit_lines = lines1[credits_line_idx + 2:debits_line_idx]

        try: # check if the DEBITS balance is multi-page
            if len([i for i in lines2 if i[:4] == 'Date']) > 1:
                debit_balance_multi_page = True
            else:
                debit_balance_multi_page = False
        except:
            debit_balance_multi_page = False

        # No CREDITS and DEBITS does not extend to second
        if not credit_balance_exists and not debit_balance_multi_page:
            debit_lines = lines1[debits_line_idx + 2:daily_balances_line_idx]
            debit_lines = concatenate_multi_line_transactions(debit_lines, bank, year, month)
            debit_df = parse_lines_with_regex(lines=debit_lines, transaction_pattern=transaction_pattern)
            debit_df['amount'] = debit_df['amount'].apply(lambda x: -1 * float(x.replace(',' , '')))

            debit_df['bank'] = bank
            debit_df['year'] = year
            debit_df['month'] = month
            transactions_df_list.append(debit_df)
            continue
        # DAILY BALANCES on first page and DEBITS does not extend to second
        elif daily_balances_on_first_page and not debit_balance_multi_page:
            debit_lines = lines1[debits_line_idx + 2:daily_balances_line_idx]
        # DEBITS on first page but extends to second
        elif debits_line_on_first_page and debit_balance_multi_page:
            second_page_debit_date_line_idx = lines2.index([i for i in lines2 if i[:4] == 'Date'][0])
            debit_lines = lines1[debits_line_idx + 2:] + lines2[second_page_debit_date_line_idx + 1:daily_balances_line_idx]
        # DAILY BALANCES not on first page and DEBITS does not extend to second
        elif not daily_balances_on_first_page and not debit_balance_multi_page:
            debit_lines = lines1[debits_line_idx + 2:]
        else:
            raise Exception(f'Uncaught case. Info: {bank}, {year}, {month}')

        credit_lines = concatenate_multi_line_transactions(credit_lines, bank, year, month)
        debit_lines = concatenate_multi_line_transactions(debit_lines, bank, year, month)

        credit_df = parse_lines_with_regex(lines=credit_lines, transaction_pattern=transaction_pattern)
        debit_df = parse_lines_with_regex(lines=debit_lines, transaction_pattern=transaction_pattern)

        credit_df['amount'] = credit_df['amount'].apply(lambda x: float(x.replace(',' , '')))
        debit_df['amount'] = debit_df['amount'].apply(lambda x: -1 * float(x.replace(',' , '')))

        transactions_df = pd.concat([credit_df, debit_df])

        transactions_df['bank'] = bank
        transactions_df['year'] = year
        transactions_df['month'] = month
        transactions_df_list.append(transactions_df)

    combined_transactions_df = pd.concat(transactions_df_list)

    return combined_transactions_df

In [4]:
east_west_bank_annual_beginning_ending_balance_df = pd.DataFrame({
    '2020': [2459.25, 15619.65],
    '2021': [15619.65, 14552.04],
    '2022': [14552.04, 3046.51],
    '2023': [3046.51, 19634.88],
    '2024': [19634.88, 2982.74],
})

# transactions_2019_df = parse_bank_statements_by_year(
#     bank='east_west_bank',
#     year='2019',
#     bank_statements_file_path='../data/bank_statements',
# )

transactions_2020_df = parse_east_west_bank_bank_statements_by_year(
    bank='east_west_bank',
    year='2020',
    bank_statements_file_path='../data/bank_statements',
)
transactions_2021_df = parse_east_west_bank_bank_statements_by_year(
    bank='east_west_bank',
    year='2021',
    bank_statements_file_path='../data/bank_statements',
)
transactions_2022_df = parse_east_west_bank_bank_statements_by_year(
    bank='east_west_bank',
    year='2022',
    bank_statements_file_path='../data/bank_statements',
)
transactions_2023_df = parse_east_west_bank_bank_statements_by_year(
    bank='east_west_bank',
    year='2023',
    bank_statements_file_path='../data/bank_statements',
)
transactions_2024_df = parse_east_west_bank_bank_statements_by_year(
    bank='east_west_bank',
    year='2024',
    bank_statements_file_path='../data/bank_statements',
)

def approx_sum(x):
    return round(sum(x), 2)

def test_annual_balance_east_west_bank(df, year, ref_df):
    tmp = round(ref_df[year][0] + approx_sum(df['amount']), 2)
    assert tmp == ref_df[year][1]

# test for regressions
test_annual_balance_east_west_bank(
    df=transactions_2020_df,
    year='2020',
    ref_df=east_west_bank_annual_beginning_ending_balance_df
)
test_annual_balance_east_west_bank(
    df=transactions_2021_df,
    year='2021',
    ref_df=east_west_bank_annual_beginning_ending_balance_df
)
test_annual_balance_east_west_bank(
    df=transactions_2022_df,
    year='2022',
    ref_df=east_west_bank_annual_beginning_ending_balance_df
)
test_annual_balance_east_west_bank(
    df=transactions_2023_df,
    year='2023',
    ref_df=east_west_bank_annual_beginning_ending_balance_df
)
test_annual_balance_east_west_bank(
    df=transactions_2024_df,
    year='2024',
    ref_df=east_west_bank_annual_beginning_ending_balance_df
)

total_transactions_df = pd.concat([
    transactions_2020_df,
    transactions_2021_df,
    transactions_2022_df,
    transactions_2023_df,
    transactions_2024_df
])

total_transactions_df['transaction_date_month'] = total_transactions_df['transaction_date'].apply(lambda x: x.split('-')[0])
assert sum(total_transactions_df['transaction_date_month'] != total_transactions_df['month']) == 0
total_transactions_df['transaction_date'] = total_transactions_df['year'] + '-' + total_transactions_df['transaction_date']
total_transactions_df.drop(['year', 'month', 'transaction_date_month'], inplace=True, axis=1)
total_transactions_df.sort_values(by=['transaction_date', 'amount', 'description'], ascending=[True, False, True], inplace=True)
total_transactions_df.reset_index(drop=True, inplace=True)

In [14]:
gc = gspread.service_account()

finance_tracker_db_spreadsheet = gc.open_by_key('1CAyyf2kr-pS7LNX1a_0ithw6niL3Js3K4ZEOlwDViZY')
east_west_bank_worksheet = finance_tracker_db_spreadsheet.worksheet('east_west_bank_bank_statements')
east_west_bank_worksheet.update([total_transactions_df.columns.values.tolist()] + total_transactions_df.values.tolist())
east_west_bank_worksheet.format("C:C", {"numberFormat": {"type": "CURRENCY"}})
east_west_bank_worksheet.format("A", {"numberFormat": {"type": "DATE_TIME"}})

{'spreadsheetId': '1CAyyf2kr-pS7LNX1a_0ithw6niL3Js3K4ZEOlwDViZY',
 'replies': [{}]}

In [8]:
# build some unit tests
# create truth table, check for regressions
# rules:
# pdf generally has length +1, ignore last blank page
# CREDITS should be in first page
# DEBITS should be in first page (could be in second, haven't seen it)
# DAILY BALANCES could be in first or second
# some transactions are multi-line
# capture the transactions under CREDITS and DEBITS correctly

# get the beginning and ending balance per month/year in a reference table

# Marcus

In [9]:
def parse_lines_with_regex(lines, transaction_pattern):
    # Ref.: https://levelup.gitconnected.com/creating-a-bank-statement-parser-with-python-9223b895ebae
    transactions = []
    for line in lines:
        match = re.search(pattern=transaction_pattern, string=line)
        if match:
            transactions.append(match.groupdict())

    return pd.DataFrame(transactions)

transaction_pattern = (
    r"(?P<transaction_date>\d+/\d+/\d+)\s*"
    r"(?P<description>.*?)(?=\$)"
    r"(?P<credit_debit>.*?)(?=\s)\s*"
    r"(?P<balance>.*)"
)

def currency_to_float(x):
    return float(x.replace('$', '').replace(',', ''))

def parse_marcus_bank_statements_by_year(bank, year, bank_statements_file_path):
    """
    Go through a year of monthly bank statements for a given bank and parse
    the statements and return a df.
    """
    file_path = os.path.join(bank_statements_file_path, bank, year)
    monthly_bank_statement_list = os.listdir(file_path)
    monthly_bank_statement_list = [i for i in monthly_bank_statement_list if i != '.DS_Store']
    transactions_df_list = []
    for monthly_bank_statement_file in monthly_bank_statement_list:
        month = monthly_bank_statement_file.split('_')[1].split('.')[0]
        with open(os.path.join(file_path, monthly_bank_statement_file), "rb") as file:
            pdf = pdftotext.PDF(file, physical=True)
            if len(pdf) == 1:
                first_page = pdf[0]
                lines1 = first_page.split("\n")
                lines1 = [i.lstrip() for i in lines1]
                lines1 = [i for i in lines1 if i != '']
                lines2 = None
            elif len(pdf) == 2:
                first_page = pdf[0]
                lines1 = first_page.split("\n")
                second_page = pdf[1]
                lines2 = second_page.split("\n")

                lines1 = [i.lstrip() for i in lines1]
                lines1 = [i for i in lines1 if i != '']
                lines2 = [i.lstrip() for i in lines2]
                lines2 = [i for i in lines2 if i != '']
            else:
                pdf_length = len(pdf)
                raise Exception(f'New length for pdf ({pdf_length}), time to set new rules. Info: {bank}, {year}, {month}')

        if lines2 is None:
            transactions_forward_list = lines1[lines1.index('ACCOUNT ACTIVITY'):]
        else:
            transactions_forward_list = lines1[lines1.index('ACCOUNT ACTIVITY'):] + lines2[lines2.index('ACCOUNT ACTIVITY (continued)'):]

        transaction_lines_list = transactions_forward_list
        transaction_lines_list = transaction_lines_list.copy()
        beginning_balance_list = transaction_lines_list.copy()

        # get the beginning entry for reference
        beginning_balance_entry = [i for i in beginning_balance_list if 'Beginning Balance' in i][0]
        beginning_balance_dict = re.search(
            pattern=(
                r"(?P<transaction_date>\d+/\d+/\d+)\s*"
                r"(?P<description>.*?)(?=\$)"
                r"(?P<balance>.*)"
            ),
            string=beginning_balance_entry
        ).groupdict()

        # parse the other lines
        date_pattern = r"^\d{2}/\d{2}/\d{4}"
        bad_idx_list = []
        for idx, txc_line in enumerate(transaction_lines_list):
            if not bool(re.match(date_pattern, txc_line[:10])):
                bad_idx_list.append(idx)

        transaction_lines_list = remove_indices(transaction_lines_list, bad_idx_list)
        transaction_lines_list = [
            i for i in transaction_lines_list if all(substring not in i for substring in ['Beginning Balance', 'Ending Balance'])
        ]

        transactions_df = parse_lines_with_regex(transaction_lines_list, transaction_pattern)
        transactions_df['credit_debit'] = transactions_df['credit_debit'].apply(currency_to_float)
        transactions_df['balance'] = transactions_df['balance'].apply(currency_to_float)
        beginning_balance_df = pd.DataFrame([beginning_balance_dict])
        beginning_balance_df['balance'] = beginning_balance_df['balance'].apply(currency_to_float)
        transactions_df = pd.concat(
            [
                beginning_balance_df,
                transactions_df
            ]
        )
        transactions_df['description'] = transactions_df['description'].apply(lambda x: x.rstrip())

        # for reversal charges which have negative values in the credit statement
        transactions_df['credit_debit'] = abs(transactions_df['credit_debit'])

        credit_debit_multiplier_list = []
        transactions_df.reset_index(drop=True, inplace=True) # fix idx for the iterrows
        for idx, row in transactions_df.iterrows():
            cur_balance = row['balance']
            if idx > 0:
                if cur_balance > prev_balance:
                    credit_debit_multiplier = 1
                else:
                    credit_debit_multiplier = -1
            else:
                credit_debit_multiplier = 1
            credit_debit_multiplier_list.append(credit_debit_multiplier)
            prev_balance = row['balance']

        transactions_df['credit_debit_multiplier'] = credit_debit_multiplier_list
        transactions_df['credit_debit'] = transactions_df['credit_debit'] * transactions_df['credit_debit_multiplier']
        transactions_df.drop(['credit_debit_multiplier'], axis=1, inplace=True)
        transactions_df = transactions_df.iloc[1:,:].copy() # drop the Beginning Balance
        transactions_df.sort_values(by='transaction_date', ascending=True, inplace=True)
        transactions_df_list.append(transactions_df)

    return pd.concat(transactions_df_list)

## Need to delete the extra pages in the pdf

In [11]:
marcus_annual_beginning_ending_balance_df = pd.DataFrame({
    '2021': [0, 11719.53],
    '2022': [11719.53, 4877.32],
    '2023': [4877.32, 28770.03],
    '2024': [28770.03, 30478.36],
})

def approx_sum(x):
    return round(sum(x), 2)

def test_annual_balance_marcus(df, year, ref_df):
    tmp = round(ref_df[year][0] + approx_sum(df['credit_debit']), 2)
    assert tmp == ref_df[year][1]

transactions_2021_df = parse_marcus_bank_statements_by_year(
    bank='marcus',
    year='2021',
    bank_statements_file_path='../data/bank_statements',
)
transactions_2022_df = parse_marcus_bank_statements_by_year(
    bank='marcus',
    year='2022',
    bank_statements_file_path='../data/bank_statements',
)
transactions_2023_df = parse_marcus_bank_statements_by_year(
    bank='marcus',
    year='2023',
    bank_statements_file_path='../data/bank_statements',
)
transactions_2024_df = parse_marcus_bank_statements_by_year(
    bank='marcus',
    year='2024',
    bank_statements_file_path='../data/bank_statements',
)

# test for regressions
test_annual_balance_marcus(
    df=transactions_2021_df,
    year='2021',
    ref_df=marcus_annual_beginning_ending_balance_df
)
test_annual_balance_marcus(
    df=transactions_2022_df,
    year='2022',
    ref_df=marcus_annual_beginning_ending_balance_df
)
test_annual_balance_marcus(
    df=transactions_2023_df,
    year='2023',
    ref_df=marcus_annual_beginning_ending_balance_df
)
test_annual_balance_marcus(
    df=transactions_2024_df,
    year='2024',
    ref_df=marcus_annual_beginning_ending_balance_df
)

transactions_df = pd.concat([
    transactions_2021_df,
    transactions_2022_df,
    transactions_2023_df,
    transactions_2024_df,
])

transactions_df['transaction_date'] = pd.to_datetime(transactions_df['transaction_date'])
transactions_df.sort_values(by='transaction_date', ascending=True, inplace=True)
transactions_df.reset_index(drop=True, inplace=True)
transactions_df['transaction_date'] = transactions_df['transaction_date'].astype(str)

In [13]:
gc = gspread.service_account()
finance_tracker_db_spreadsheet = gc.open_by_key('1CAyyf2kr-pS7LNX1a_0ithw6niL3Js3K4ZEOlwDViZY')
marcus_worksheet = finance_tracker_db_spreadsheet.worksheet('marcus_bank_statements')
marcus_worksheet.update([transactions_df.columns.values.tolist()] + transactions_df.values.tolist())
marcus_worksheet.format("C:D", {"numberFormat": {"type": "CURRENCY"}})

{'spreadsheetId': '1CAyyf2kr-pS7LNX1a_0ithw6niL3Js3K4ZEOlwDViZY',
 'replies': [{}]}

# East West Bank Credit Card

In [542]:
def parse_lines_with_regex(lines, transaction_pattern):
    # Ref.: https://levelup.gitconnected.com/creating-a-bank-statement-parser-with-python-9223b895ebae
    transactions = []
    for line in lines:
        match = re.search(pattern=transaction_pattern, string=line)
        if match:
            transactions.append(match.groupdict())

    return pd.DataFrame(transactions)

transaction_pattern = (
    r"(?P<post_date>\d{2}/\d{2})\s*"
    r"(?P<transaction_date>\d{2}/\d{2})\s*"
    r"(?P<ref_num>\S*)\s*"
    r"(?P<description>.*?)(?=\$)"
    r"(?P<amount>\S.*)"
)

def clean_amount_col(x):
    # remove ($), (,), (alphabetical)
    return float(re.sub(r"[^\d|\.]", "", x.replace('$', '')))

def check_list_len_bool(l):
    if len(l) > 0:
        return True
    else:
        return False

def remove_indices(lst, indices):
    # Sort indices in descending order to avoid reindexing issues
    indices = sorted(indices, reverse=True)
    
    # Remove elements at each index
    for index in indices:
        if 0 <= index < len(lst):
            lst.pop(index)
        else:
            raise IndexError(f"Index {index} is out of bounds for list of length {len(lst)}")
    
    return lst

def drop_multiline_transactions(transaction_lines):
    transaction_lines = transaction_lines.copy()
    bad_idx_list = []
    for idx, i in enumerate(transaction_lines):
        # if not bool(re.match(r"^(\d{2}/\d{2})\s*(\d{2}/\d{2})", i)):
        if not bool(re.match(r"^(\d{2}/\d{2})\s*", i)):
            bad_idx_list.append(idx)
    transaction_lines = remove_indices(lst=transaction_lines, indices=bad_idx_list)
    return transaction_lines

def insert_na_for_missing_txn_date_and_ref_num(transaction_lines):
    # works only for when there's a post date and no transaction date
    null_idx_list = []
    for idx, i in enumerate(transaction_lines):
        if bool(re.match(r"^(\d{2}/\d{2})\s{8,}", i)):
            null_idx_list.append(idx)

    for idx in null_idx_list:
        transaction_lines[idx] = transaction_lines[idx][:8] + \
            transaction_lines[idx][:5] + \
            '    NA' + \
            transaction_lines[idx][10:]
    return transaction_lines

def process_transaction_lines(payments_and_other_credits_lines=[], purchases_and_other_debits_lines=[], fees_lines=[]):
    """
    Return the transactions df from the transaction lines.
    """
    if payments_and_other_credits_lines and purchases_and_other_debits_lines and not fees_lines:
        payments_and_other_credits_lines = drop_multiline_transactions(payments_and_other_credits_lines)
        purchases_and_other_debits_lines = drop_multiline_transactions(purchases_and_other_debits_lines)

        payments_and_other_credits_lines = insert_na_for_missing_txn_date_and_ref_num(payments_and_other_credits_lines)
        purchases_and_other_debits_lines = insert_na_for_missing_txn_date_and_ref_num(purchases_and_other_debits_lines)

        payments_and_other_credits_df = parse_lines_with_regex(payments_and_other_credits_lines, transaction_pattern)
        purchases_and_other_debits_df = parse_lines_with_regex(purchases_and_other_debits_lines, transaction_pattern)

        payments_and_other_credits_df['amount'] = payments_and_other_credits_df['amount'].apply(clean_amount_col)
        purchases_and_other_debits_df['amount'] = purchases_and_other_debits_df['amount'].apply(clean_amount_col)

        payments_and_other_credits_df['amount'] = payments_and_other_credits_df['amount'] * -1 # make negative

        transactions_df = pd.concat([payments_and_other_credits_df, purchases_and_other_debits_df])
        transactions_df.drop(['ref_num'], axis=1, inplace=True)
        return transactions_df
    elif payments_and_other_credits_lines and not fees_lines:
        payments_and_other_credits_lines = drop_multiline_transactions(payments_and_other_credits_lines)

        payments_and_other_credits_lines = insert_na_for_missing_txn_date_and_ref_num(payments_and_other_credits_lines)

        payments_and_other_credits_df = parse_lines_with_regex(payments_and_other_credits_lines, transaction_pattern)

        payments_and_other_credits_df['amount'] = payments_and_other_credits_df['amount'].apply(clean_amount_col)

        payments_and_other_credits_df.drop(['ref_num'], axis=1, inplace=True)
        return payments_and_other_credits_df
    elif purchases_and_other_debits_lines and not fees_lines:
        purchases_and_other_debits_lines = drop_multiline_transactions(purchases_and_other_debits_lines)

        purchases_and_other_debits_lines = insert_na_for_missing_txn_date_and_ref_num(purchases_and_other_debits_lines)

        purchases_and_other_debits_df = parse_lines_with_regex(purchases_and_other_debits_lines, transaction_pattern)

        purchases_and_other_debits_df['amount'] = purchases_and_other_debits_df['amount'].apply(clean_amount_col)

        purchases_and_other_debits_df.drop(['ref_num'], axis=1, inplace=True)
        return purchases_and_other_debits_df
    elif payments_and_other_credits_lines and purchases_and_other_debits_lines and fees_lines:
        payments_and_other_credits_lines = drop_multiline_transactions(payments_and_other_credits_lines)
        purchases_and_other_debits_lines = drop_multiline_transactions(purchases_and_other_debits_lines)
        fees_lines = drop_multiline_transactions(fees_lines)

        payments_and_other_credits_lines = insert_na_for_missing_txn_date_and_ref_num(payments_and_other_credits_lines)
        purchases_and_other_debits_lines = insert_na_for_missing_txn_date_and_ref_num(purchases_and_other_debits_lines)
        fees_lines = insert_na_for_missing_txn_date_and_ref_num(fees_lines)

        payments_and_other_credits_df = parse_lines_with_regex(payments_and_other_credits_lines, transaction_pattern)
        purchases_and_other_debits_df = parse_lines_with_regex(purchases_and_other_debits_lines, transaction_pattern)
        fees_lines_df = parse_lines_with_regex(fees_lines, transaction_pattern)

        payments_and_other_credits_df['amount'] = payments_and_other_credits_df['amount'].apply(clean_amount_col)
        purchases_and_other_debits_df['amount'] = purchases_and_other_debits_df['amount'].apply(clean_amount_col)
        fees_lines_df['amount'] = fees_lines_df['amount'].apply(clean_amount_col)

        payments_and_other_credits_df['amount'] = payments_and_other_credits_df['amount'] * -1 # make negative

        transactions_df = pd.concat([payments_and_other_credits_df, purchases_and_other_debits_df, fees_lines_df])
        transactions_df.drop(['ref_num'], axis=1, inplace=True)
        return transactions_df

def add_year_to_date(first_page, transactions_df):
    """
    Add the year to the date columns. Handle cases with two years (Jan/Dec).
    """
    date_range = first_page.split("\n")[0].split('Statement')[1].split('Page')[0].lstrip().rstrip()
    start_date = date_range.split(' - ')[0]
    end_date = date_range.split(' - ')[1]
    start_year = start_date[6:]
    start_month = start_date[:2]
    end_year = end_date[6:]
    end_month = end_date[:2]
    if start_year == end_year:
        transactions_df['post_date'] = transactions_df['post_date'] + '/' + start_year
        transactions_df['transaction_date'] = transactions_df['transaction_date'] + '/' + start_year
    else:
        month_year_mapping = {
            start_month: start_year,
            end_month: end_year,
        }
        transactions_df['post_date_month'] = transactions_df['post_date'].apply(lambda x: x.split('/')[0])
        transactions_df['transaction_date_month'] = transactions_df['transaction_date'].apply(lambda x: x.split('/')[0])
        transactions_df['post_date'] = transactions_df['post_date'] + \
            '/' + \
            transactions_df['post_date_month'].apply(lambda x: month_year_mapping[x])
        transactions_df['transaction_date'] = transactions_df['transaction_date'] + \
            '/' + \
            transactions_df['transaction_date_month'].apply(lambda x: month_year_mapping[x])
        transactions_df.drop(['post_date_month', 'transaction_date_month'], axis=1, inplace=True)

    transactions_df['date_range'] = date_range
    return transactions_df

In [747]:
credit_card='east_west_bank'
year='2022'
credit_card_statements_file_path='../data/credit_card_statements'

file_path = os.path.join(credit_card_statements_file_path, credit_card, year)
monthly_credit_card_statement_list = os.listdir(file_path)
monthly_credit_card_statement_list = [i for i in monthly_credit_card_statement_list if i != '.DS_Store']
transactions_df_list = []

# for monthly_credit_card_statement_file in monthly_credit_card_statement_list:
monthly_credit_card_statement_file = monthly_credit_card_statement_list[1]
month = monthly_credit_card_statement_file.split('_')[1].split('.')[0]

with open(os.path.join(file_path, monthly_credit_card_statement_file), "rb") as file:
    pdf = pdftotext.PDF(file, physical=True)
    # if len(pdf) == 1:
    #     first_page = pdf[0]
    #     lines1 = first_page.split("\n")
    #     lines1 = [i.lstrip() for i in lines1]
    #     lines1 = [i for i in lines1 if i != '']
    #     lines2 = None
    # elif len(pdf) == 2:
    #     first_page = pdf[0]
    #     lines1 = first_page.split("\n")
    #     second_page = pdf[1]
    #     lines2 = second_page.split("\n")

    #     lines1 = [i.lstrip() for i in lines1]
    #     lines1 = [i for i in lines1 if i != '']
    #     lines2 = [i.lstrip() for i in lines2]
    #     lines2 = [i for i in lines2 if i != '']
    # else:
    #     pdf_length = len(pdf)
    #     raise Exception(f'New length for pdf ({pdf_length}), time to set new rules. Info: {bank}, {year}, {month}')

In [748]:
monthly_credit_card_statement_list

['2022_02.pdf',
 '2022_03.pdf',
 '2022_01.pdf',
 '2022_10.pdf',
 '2022_04.pdf',
 '2022_05.pdf',
 '2022_11.pdf',
 '2022_07.pdf',
 '2022_12.pdf',
 '2022_06.pdf',
 '2022_08.pdf',
 '2022_09.pdf']

In [749]:
monthly_credit_card_statement_file

'2022_03.pdf'

In [750]:
# edge cases: multi-line transactions
# 2023_08 multi-line (4) across two pages
# edge cases: multi-page transactions
# edge cases: Payments and Other Credits (may or may not exist)
# edge cases: Purchases and Other Debits (may or may not exist) (can be on first or second page)
# edge cases: trans date and ref num missing
# edge cases: trans date missing
# case 0: Payments and Other Credits exists (page 1), Purchases and Other Debits exists (page 2)
# 2024_02, 2022_03, 2022_05
# case 1: Payments and Other Credits exists (page 1), Purchases and Other Debits exists (pages 1 and 2)
# 2024_01, 2023_01, 2023_02?, 2023_03, 2023_04, 2023_05, 2023_07, 2023_08, 2023_09, 2023_12, 2022_04, 2022_06, 2022_09, 2022_12
# case 2: Payments and Other Credits exists (page 1), Purchases and Other Debits exists (page 2)
# case 6: Payments and Other Credits exists (page 1), Purchases and Other Debits doesn't exist
# case 3: Payments and Other Credits doesn't exist, Purchases and Other Debits exists (pages 1 and 2)
# 2024_05
# case 4: Payments and Other Credits doesn't exist, Purchases and Other Debits exists (page 1)
# 2024_03, 2024_04
# case 5: Payments and Other Credits exists (page 1), Purchases and Other Debits exists (page 1)
# 2023_10
# case 7: Payments and Other Credits exists (page 1), Purchases and Other Debits exists (pages 1 and 2), Fees (page 2)
# 2023_06,  2023_11, 2022_11
# case 8: Payments and Other Credits exists (page 1), Purchases and Other Debits exists (page 2), Fees (page 2)
# 2022_01, 2022_07, 2022_08
# case 9: Payments and Other Credits exists (pages 1 and 2), Purchases and Other Debits exists (pages 2 and 3), Fees (page 3)
# 2022_02
# case 11: Payments and Other Credits exists (pages 1 and 2), Purchases and Other Debits exists (page 2)
# 2022_10
# case 12: Payments and Other Credits exists (pages 1 and 2), Purchases and Other Debits exists (page 2), Fees (page 2)
# 2021_02!!!
# case 13: Payments and Other Credits exists (pages 1 and 2), Purchases and Other Debits exists (page 2), Fees (page 3)
# 2021_04!!!
# case 14: Payments and Other Credits exists (pages 1 and 2), Purchases and Other Debits exists (page 2), Fees (pages 3 and 4)
# 2021_05!!!

# edge case:
# 2021_03

In [751]:
payments_and_other_credits_line1, payments_and_other_credits_line2, purchases_and_other_debits_line1, purchases_and_other_debits_line2, fees_line2

(['Payments and Other Credits'],
 ['Payments and Other Credits'],
 [],
 ['Purchases and Other Debits'],
 ['Fees'])

In [752]:
def check_for_indices(lines, keyword, exact_match=True):
    idx_list = []
    for idx, i in enumerate(lines):
        if exact_match:
            if i == keyword:
                idx_list.append(idx)
        else:
            if i[:len(keyword)] == keyword:
                idx_list.append(idx)
    return idx_list

In [753]:
# collect lines
lines_list = []
for page in pdf:
    lines = page.split("\n")
    lines = [i.lstrip() for i in lines]
    lines = [i for i in lines if i != '']
    if [i for i in lines if 'Transactions' in i]:
        lines_list += lines

payments_and_other_credits_start_idx_list = check_for_indices(lines_list, 'Payments and Other Credits')
purchases_and_other_debits_start_idx_list = check_for_indices(lines_list, 'Purchases and Other Debits')
total_this_period_idx_list = check_for_indices(lines_list, 'TOTAL THIS PERIOD', False)
fees_start_idx_list = check_for_indices(lines_list, 'Fees')
total_fees_this_period_idx_list = check_for_indices(lines_list, 'TOTAL FEES THIS PERIOD', False)
if payments_and_other_credits_start_idx_list:
    payments_and_other_credits_lines = lines_list[
        payments_and_other_credits_start_idx_list[0] + 3: \
        total_this_period_idx_list[0]
    ]
if purchases_and_other_debits_start_idx_list:
    purchases_and_other_debits_lines = lines_list[
        purchases_and_other_debits_start_idx_list[0] + 3: \
        total_this_period_idx_list[-1]
    ]
if fees_start_idx_list:
    fees_lines = lines_list[
        fees_start_idx_list[0] + 3: \
        total_fees_this_period_idx_list[0]
    ]

transactions_df = process_transaction_lines(payments_and_other_credits_lines, purchases_and_other_debits_lines, fees_lines)
transactions_df = add_year_to_date(pdf[2], transactions_df)
transactions_df_list.append(transactions_df)

In [757]:
fees_start_idx_list = check_for_indices(lines_list, 'Fees')
fees_start_idx_list

[]

In [755]:
fees_lines

['05/03   04/30   5725      FRGN TRANS FEE-luxiaojunbarbell     mi                             $3.58',
 'Continued on Next Page',
 'May 2021 Statement                 04/08/2021 - 05/07/2021                                                                              Page 4 of 4',
 '10101010101010101010',
 '10111001010010100011',
 '10100001000010001000',
 '10011101010110100111',
 '11011100001001001010',
 'JARED V YU                                                                             Cardmember Service                         1-800-558-3424',
 '11000101010000001101',
 '10010001000101011100',
 '10010000110010100001',
 '11011000001110100010',
 '11001010001001101101',
 '10000000110010000000',
 '10111001000010000011',
 '10000011111001110000',
 '10100000010010100111',
 '10001111001011110100',
 '10001110011100001111',
 '10110110100111010100',
 '11010101111001011101',
 '10011100001010111010',
 '11111111111111111111   Transactions',
 'Fees',
 'Post       Trans',
 'Date       Date       

In [754]:
transactions_df

Unnamed: 0,post_date,transaction_date,description,amount,date_range
0,02/10/2022,02/10/2022,PAYMENT THANK YOU ...,-560.8,02/08/2022 - 03/09/2022
1,02/18/2022,02/18/2022,PAYMENT THANK YOU ...,-411.8,02/08/2022 - 03/09/2022
2,02/23/2022,02/23/2022,PAYMENT THANK YOU ...,-3554.9,02/08/2022 - 03/09/2022
3,03/04/2022,03/04/2022,PAYMENT THANK YOU ...,-180.79,02/08/2022 - 03/09/2022
0,02/08/2022,02/07/2022,UCSB CRTYD CAFE Q01 SANTA BARBARA CA ...,22.71,02/08/2022 - 03/09/2022
1,02/09/2022,02/09/2022,APPLE.COM/BILL 866-712-7753 CA ...,2.99,02/08/2022 - 03/09/2022
2,02/14/2022,02/11/2022,ALBERTSONS #0354 GOLETA CA ...,15.36,02/08/2022 - 03/09/2022
3,02/14/2022,02/12/2022,WATERFRONT PARKING SANTA BARBARA CA ...,2.0,02/08/2022 - 03/09/2022
4,02/14/2022,02/12/2022,TRADER JOE'S #059 QPS SANTA BARBARA CA ...,5.63,02/08/2022 - 03/09/2022
5,02/14/2022,02/12/2022,SQ *THE CULTURED AB GOLETA CA ...,20.0,02/08/2022 - 03/09/2022


In [None]:
credit_card='east_west_bank'
year='2023'
credit_card_statements_file_path='../data/credit_card_statements'

file_path = os.path.join(credit_card_statements_file_path, credit_card, year)
monthly_credit_card_statement_list = os.listdir(file_path)
monthly_credit_card_statement_list = [i for i in monthly_credit_card_statement_list if i != '.DS_Store']
transactions_df_list = []

# for monthly_credit_card_statement_file in monthly_credit_card_statement_list:
monthly_credit_card_statement_file = monthly_credit_card_statement_list[5]
month = monthly_credit_card_statement_file.split('_')[1].split('.')[0]

with open(os.path.join(file_path, monthly_credit_card_statement_file), "rb") as file:
    pdf = pdftotext.PDF(file, physical=True)

In [758]:
def parse_east_west_bank_credit_card_statements_by_year(credit_card, year, credit_card_statements_file_path):
    """
    Go through a year of monthly bank statements for a given bank and parse
    the statements and return a df.
    """
    file_path = os.path.join(credit_card_statements_file_path, credit_card, year)
    monthly_credit_card_statement_list = os.listdir(file_path)
    monthly_credit_card_statement_list = [i for i in monthly_credit_card_statement_list if i != '.DS_Store']
    transactions_df_list = []
    for monthly_credit_card_statement_file in monthly_credit_card_statement_list:
        month = monthly_credit_card_statement_file.split('_')[1].split('.')[0]
        with open(os.path.join(file_path, monthly_credit_card_statement_file), "rb") as file:
            pdf = pdftotext.PDF(file, physical=True)

        # collect lines
        lines_list = []
        for page in pdf:
            lines = page.split("\n")
            lines = [i.lstrip() for i in lines]
            lines = [i for i in lines if i != '']
            if [i for i in lines if 'Transactions' in i]:
                lines_list += lines

        payments_and_other_credits_start_idx_list = check_for_indices(lines_list, 'Payments and Other Credits')
        purchases_and_other_debits_start_idx_list = check_for_indices(lines_list, 'Purchases and Other Debits')
        total_this_period_idx_list = check_for_indices(lines_list, 'TOTAL THIS PERIOD', False)
        fees_start_idx_list = check_for_indices(lines_list, 'Fees')
        total_fees_this_period_idx_list = check_for_indices(lines_list, 'TOTAL FEES THIS PERIOD', False)
        if payments_and_other_credits_start_idx_list:
            payments_and_other_credits_lines = lines_list[
                payments_and_other_credits_start_idx_list[0] + 3: \
                total_this_period_idx_list[0]
            ]
        else:
            payments_and_other_credits_lines = []
        if purchases_and_other_debits_start_idx_list:
            purchases_and_other_debits_lines = lines_list[
                purchases_and_other_debits_start_idx_list[0] + 3: \
                total_this_period_idx_list[-1]
            ]
        else:
            purchases_and_other_debits_lines = []
        if fees_start_idx_list:
            fees_lines = lines_list[
                fees_start_idx_list[0] + 3: \
                total_fees_this_period_idx_list[0]
            ]
        else:
            fees_lines = []

        transactions_df = process_transaction_lines(payments_and_other_credits_lines, purchases_and_other_debits_lines, fees_lines)
        transactions_df = add_year_to_date(pdf[2], transactions_df)
        transactions_df_list.append(transactions_df)
    return pd.concat(transactions_df_list)

In [759]:
transactions_2022_df = parse_east_west_bank_credit_card_statements_by_year(
    credit_card='east_west_bank',
    year='2022',
    credit_card_statements_file_path='../data/credit_card_statements',
)
# transactions_2023_df = parse_east_west_bank_credit_card_statements_by_year(
#     credit_card='east_west_bank',
#     year='2023',
#     credit_card_statements_file_path='../data/credit_card_statements',
# )
# transactions_2024_df = parse_east_west_bank_credit_card_statements_by_year(
#     credit_card='east_west_bank',
#     year='2024',
#     credit_card_statements_file_path='../data/credit_card_statements',
# )

In [761]:
beginning_balance = 99.31
# beginning_balance = 137.52
# beginning_balance = 82.56
annual_sum = round(sum(transactions_2022_df['amount']), 2)
beginning_balance + annual_sum, annual_sum

(137.52, 38.21)

In [762]:
test_df = transactions_2022_df.copy()
test_df['start_date'] = test_df['date_range'].apply(lambda x: x.split(' - ')[0])
test_df['start_date'] = pd.to_datetime(test_df['start_date'])
test_df.sort_values(by=['start_date', 'post_date'], ascending=[True, True], inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [763]:
start_date_list = list(set(test_df['start_date']))
start_date_list = sorted(start_date_list)

In [764]:
pd.set_option('display.max_rows', 500)

In [765]:
# transactions_2022_df

In [766]:
# round(sum(test_df.iloc[0:34]['amount']), 2)
test_df2 = test_df.loc[test_df['start_date'] == start_date_list[2]].copy()
tot_amt = round(sum(test_df2['amount']), 2)
# 99.31 + tot_amt, tot_amt # Jan
# 0 + tot_amt, tot_amt # Feb
535.10 + tot_amt, tot_amt # Mar



# 176.27 + tot_amt, tot_amt

(2.990000000000009, -532.11)

In [767]:
# test_df2.loc[test_df2['amount'] > 0]

In [768]:
transactions_2022_df['transaction_date'] = pd.to_datetime(transactions_2022_df['transaction_date'])
transactions_2022_df['post_date'] = pd.to_datetime(transactions_2022_df['post_date'])
transactions_2022_df.sort_values(by='post_date', ascending=True, inplace=True)
transactions_2022_df.reset_index(drop=True, inplace=True)

In [773]:
east_west_bank_credit_card_annual_beginning_ending_balance_df = pd.DataFrame({
    '2020': [45.21, 1080.03],
    '2021': [1080.03, 99.31],
    '2022': [99.31, 137.52],
    '2023': [137.52, 82.56],
    '2024': [82.56, -275.03],
})

transactions_2020_df = parse_east_west_bank_credit_card_statements_by_year(
    credit_card='east_west_bank',
    year='2020',
    credit_card_statements_file_path='../data/credit_card_statements',
)
transactions_2021_df = parse_east_west_bank_credit_card_statements_by_year(
    credit_card='east_west_bank',
    year='2021',
    credit_card_statements_file_path='../data/credit_card_statements',
)
transactions_2022_df = parse_east_west_bank_credit_card_statements_by_year(
    credit_card='east_west_bank',
    year='2022',
    credit_card_statements_file_path='../data/credit_card_statements',
)
transactions_2023_df = parse_east_west_bank_credit_card_statements_by_year(
    credit_card='east_west_bank',
    year='2023',
    credit_card_statements_file_path='../data/credit_card_statements',
)
transactions_2024_df = parse_east_west_bank_credit_card_statements_by_year(
    credit_card='east_west_bank',
    year='2024',
    credit_card_statements_file_path='../data/credit_card_statements',
)

def approx_sum(x):
    return round(sum(x), 2)

def test_annual_balance_east_west_bank_credit_card(df, year, ref_df):
    tmp = round(ref_df[year][0] + approx_sum(df['amount']), 2)
    assert tmp == ref_df[year][1]

# test for regressions
test_annual_balance_east_west_bank_credit_card(
    df=transactions_2020_df,
    year='2020',
    ref_df=east_west_bank_credit_card_annual_beginning_ending_balance_df
)
test_annual_balance_east_west_bank_credit_card(
    df=transactions_2021_df,
    year='2021',
    ref_df=east_west_bank_credit_card_annual_beginning_ending_balance_df
)
test_annual_balance_east_west_bank_credit_card(
    df=transactions_2022_df,
    year='2022',
    ref_df=east_west_bank_credit_card_annual_beginning_ending_balance_df
)
test_annual_balance_east_west_bank_credit_card(
    df=transactions_2023_df,
    year='2023',
    ref_df=east_west_bank_credit_card_annual_beginning_ending_balance_df
)
test_annual_balance_east_west_bank_credit_card(
    df=transactions_2024_df,
    year='2024',
    ref_df=east_west_bank_credit_card_annual_beginning_ending_balance_df
)

total_transactions_df = pd.concat([
    transactions_2020_df,
    transactions_2021_df,
    transactions_2022_df,
    transactions_2023_df,
    transactions_2024_df
])

total_transactions_df['post_date_fmt'] = pd.to_datetime(total_transactions_df['post_date'])
total_transactions_df['transaction_date_fmt'] = pd.to_datetime(total_transactions_df['transaction_date'])
total_transactions_df.sort_values(
    by=['post_date_fmt', 'transaction_date_fmt', 'amount', 'description'],
    ascending=[True, True, True, True],
    inplace=True
)
total_transactions_df.drop(['post_date_fmt', 'transaction_date_fmt'], axis=1, inplace=True)
total_transactions_df.reset_index(drop=True, inplace=True)

TypeError: 'NoneType' object is not subscriptable

In [771]:
total_transactions_df

Unnamed: 0,post_date,transaction_date,description,amount,date_range
0,12/09/2021,12/07/2021,ALBERTSONS #0354 GOLETA CA ...,19.82,12/09/2021 - 01/07/2022
1,12/10/2021,12/09/2021,APPLE.COM/BILL 866-712-7753 CA ...,2.99,12/09/2021 - 01/07/2022
2,12/13/2021,12/10/2021,WASH LAUNDRY WAVERIDER EL SEGUNDO CA ...,5.50,12/09/2021 - 01/07/2022
3,12/13/2021,12/10/2021,COSTCO WHSE #0474 GOLETA CA ...,43.08,12/09/2021 - 01/07/2022
4,12/13/2021,12/11/2021,ALBERTSONS #0354 GOLETA CA ...,3.49,12/09/2021 - 01/07/2022
...,...,...,...,...,...
496,04/25/2024,04/23/2024,TCB*MTA METER MTA P SAN FRANCISCO CA ...,28.13,04/06/2024 - 05/08/2024
497,04/29/2024,04/25/2024,TCB*MTA METER MTA P SAN FRANCISCO CA ...,1.73,04/06/2024 - 05/08/2024
498,04/29/2024,04/27/2024,SWEETFIN WWW.SWEETFIN. CA ...,2.95,04/06/2024 - 05/08/2024
499,04/29/2024,04/27/2024,TCB*MTA METER MTA P 4157012311 CA ...,8.85,04/06/2024 - 05/08/2024


# Discover Credit Card