In [1]:
import re
import xlrd

# Path of the statement spreadsheet to parse (relative to the notebook's working directory).
STATEMENT_FILE = 'Fatura-Excel.xls'

# Open the workbook with xlrd; `book` is reused below to pick the worksheet.
book = xlrd.open_workbook(STATEMENT_FILE)

# Quick sanity report: how many worksheets exist and what they are called.
print("The number of worksheets is {0}".format(book.nsheets))
print("Worksheet name(s): {0}".format(book.sheet_names()))
# Only the first worksheet is used; `sh` is the sheet every later cell reads from.
sh = book.sheet_by_index(0)
# Sheet name, row count and column count of the selected worksheet.
print("{0} {1} {2}".format(sh.name, sh.nrows, sh.ncols))
# Debug aid: uncomment to dump every row of the sheet.
# for rx in range(sh.nrows):
#     print(sh.row(rx))

In [3]:
# Cell type codes compared against the result of sh.cell_type().
# NOTE(review): these appear to mirror xlrd.XL_CELL_EMPTY / XL_CELL_TEXT / XL_CELL_NUMBER;
# confirm against the installed xlrd version.
XL_CELL_EMPTY = 0
XL_CELL_TEXT = 1
XL_CELL_NUMBER = 2

# Zero-based column positions of the fields read from each expense row.
DATE_COLUMN = 0
TEXT_COLUMN = 1
VALUE_COLUMN = 3

# Regex for identifying a date written as dd/mm/yyyy.
# Day must be 01-31 and month 01-12, so '00/..' and '../00/..' are rejected.
# Calendar validity is NOT checked: '31/02/2022' still matches.
date_regex = re.compile(r'^(?:0[1-9]|[12][0-9]|3[01])/(?:0[1-9]|1[0-2])/\d{4}$')

In [66]:
# Names searched for in the first column to detect the start of each person's section.
people = ['VITOR HENRIQUE DE MORAES E', 'LARA JEVEAUX ALVES']

# One independent, initially empty expense list per person.
expenses = {key: [] for key in people}

# Learning moment:
# expenses = dict.fromkeys(people, [])
# does NOT work here. dict.fromkeys stores the very same list object under every key
# (no copy is made), so appending an expense for one person made it show up under
# all of them, i.e. the entries looked duplicated. The comprehension above creates
# a separate list for each key instead.

def formatExpenseDict(date, text, value):
    """Bundle one statement row into an expense record.

    Args:
        date: Value stored under the 'date' key.
        text: Value stored under the 'text' key.
        value: Value stored under the 'value' key.

    Returns:
        A new dict with the keys 'date', 'text' and 'value', in that order.
    """
    return dict(date=date, text=text, value=value)

# Scan all rows and group expenses by person.
# A row whose first cell contains one of the names in `people` opens that person's
# section; each later row whose first cell is a dd/mm/yyyy string is one expense.
current_person = ''
for rx in range(sh.nrows):
    # Only text cells can hold a section header or a date string. Checking the type
    # before the substring test avoids a TypeError when the cell holds a number.
    if sh.cell_type(rowx=rx, colx=DATE_COLUMN) != XL_CELL_TEXT:
        continue
    first_cell = sh.cell_value(rowx=rx, colx=DATE_COLUMN)

    # Use the matched entry of `people` itself as the grouping key, so it is always
    # a valid key of `expenses` no matter what else the header cell contains.
    matched_person = next((person for person in people if person in first_cell), None)
    if matched_person is not None:
        current_person = matched_person
        continue

    if date_regex.search(first_cell) is None:
        # Neither a section header nor an expense row.
        continue
    if not current_person:
        # Dated row found before any person header: nobody to attribute it to.
        continue

    # Valid expense row: collect its fields and file it under the current person.
    text_cell = sh.cell_value(rowx=rx, colx=TEXT_COLUMN)
    value_cell = sh.cell_value(rowx=rx, colx=VALUE_COLUMN)
    expense = formatExpenseDict(first_cell, text_cell, value_cell)
    expenses[current_person].append(expense)
    # print(current_person, expense)

In [67]:
from datetime import date

# REMOVING EXPENSES BEFORE SELECTED DATE

# Cutoff written as dd/mm/yyyy. Expenses dated strictly before this day are dropped
# by the filtering loop in this cell; expenses on the cutoff day itself are kept.
IGNORE_EXPENSES_BEFORE_DATE = '16/04/2022'

def transform_date(str_date = '31/12/2021'):
    """Turn a 'dd/mm/yyyy' string into a ``datetime.date``.

    Args:
        str_date: Date text with day, month and year separated by '/'.

    Returns:
        The corresponding ``date`` object.

    Raises:
        ValueError: If the text does not split into exactly three integer
            parts, or the parts do not form a real calendar date.
    """
    pieces = [int(piece) for piece in str_date.split('/')]
    day, month, year = pieces
    return date(year, month, day)

# Parsed cutoff: anything dated earlier than this is discarded.
last_date = transform_date(IGNORE_EXPENSES_BEFORE_DATE)

for person, expensesList in expenses.items():
    # Build the list of survivors first, then write it back with slice assignment.
    # This filters in place (the list object stored in `expenses` stays the same)
    # without removing items from a list that is still being iterated.
    expensesList[:] = [
        expense
        for expense in expensesList
        if transform_date(expense['date']) >= last_date
    ]

In [68]:
# CLEAR REFUNDS DISGUISED AS EXPENSES
# A refund is an entry with a negative value. Each refund cancels the nearest
# earlier surviving expense with the same amount, and both are dropped.
# A refund with no matching expense above it is kept as is.
#
# The surviving entries are collected in a separate list instead of removing items
# from the list being iterated. Removing during iteration made the loop skip the
# entries right after each refund, so consecutive refunds could be missed.
for person, expensesList in expenses.items():
    kept = []
    for expense in expensesList:
        if expense['value'] < 0:
            # Search retroactively, expecting the refunded expense to be near and above.
            # Only a positive entry of the same amount counts, so a refund can never
            # be paired with another refund. Amounts are compared exactly.
            for jdx in reversed(range(len(kept))):
                if kept[jdx]['value'] == -expense['value']:
                    # Drop the refunded expense; the refund itself is never added.
                    del kept[jdx]
                    break
            else:
                # No expense to cancel: keep the unmatched refund.
                kept.append(expense)
        else:
            kept.append(expense)
    # Slice assignment updates the list held in `expenses` in place.
    expensesList[:] = kept

In [70]:
# Print every remaining expense, one per line, prefixed with the person it belongs to.
for person in expenses:
    for expense in expenses[person]:
        print(person, expense)

VITOR HENRIQUE DE MORAES E {'date': '16/04/2022', 'text': 'Dom Esfiha P-ct', 'value': 97.81}
VITOR HENRIQUE DE MORAES E {'date': '16/04/2022', 'text': 'Pag*bakeryte-ct', 'value': 25.26}
VITOR HENRIQUE DE MORAES E {'date': '17/04/2022', 'text': 'Edmar Lima A-ct Da', 'value': 33.0}
VITOR HENRIQUE DE MORAES E {'date': '19/04/2022', 'text': 'Pag*pastel  -ct', 'value': 24.5}
VITOR HENRIQUE DE MORAES E {'date': '19/04/2022', 'text': 'Carone Gaivo-ct', 'value': 62.16}
VITOR HENRIQUE DE MORAES E {'date': '20/04/2022', 'text': '99*gilmar Ferreira Barr', 'value': 27.48}
VITOR HENRIQUE DE MORAES E {'date': '21/04/2022', 'text': 'Churrascaria-ct Lista', 'value': 42.92}
VITOR HENRIQUE DE MORAES E {'date': '21/04/2022', 'text': 'Paygo*40 Sab-ct  Gaivo', 'value': 16.8}
VITOR HENRIQUE DE MORAES E {'date': '23/04/2022', 'text': 'Nossafarma F-ct Cia Lt', 'value': 29.9}
VITOR HENRIQUE DE MORAES E {'date': '24/04/2022', 'text': 'Padaria Puro-ct Go', 'value': 56.19}
VITOR HENRIQUE DE MORAES E {'date': '24/