In [36]:
import re
import xlrd

STATEMENT_FILE = 'Fatura-Excel.xls'

# Open the statement workbook and report its overall layout
book = xlrd.open_workbook(STATEMENT_FILE)
print(f"The number of worksheets is {book.nsheets}")
print(f"Worksheet name(s): {book.sheet_names()}")

# All parsing below works on the first worksheet
sh = book.sheet_by_index(0)
print(f"{sh.name} {sh.nrows} {sh.ncols}")

# Uncomment to dump every row while inspecting the sheet layout:
# for rx in range(sh.nrows):
#     print(sh.row(rx))

The number of worksheets is 1
Worksheet name(s): ['Lançamentos']
Lançamentos 112 4


In [37]:
# Cell types as compared against sh.cell_type() further down.
# NOTE(review): these appear to mirror xlrd's own XL_CELL_* constants - confirm.
XL_CELL_EMPTY = 0
XL_CELL_TEXT = 1
XL_CELL_NUMBER = 2

# Zero-based column positions of the fields read from each statement row
DATE_COLUMN = 0
TEXT_COLUMN = 1
VALUE_COLUMN = 3

# Regex for identifying a date written as dd/mm/yyyy.
# Day must be 01-31 and month 01-12. "00" is rejected for both, because such a
# string would pass this filter and then fail when converted into a real date.
date_regex = re.compile(r'^(0[1-9]|[12][0-9]|3[01])/(0[1-9]|1[0-2])/\d{4}$')

In [38]:
people = ['VITOR HENRIQUE DE MORAES E', 'LARA JEVEAUX ALVES']

# One independent, initially empty expense list per person
expenses = {person: [] for person in people}

# Learning moment:
# expenses = dict.fromkeys(people, [])
# does NOT work here. fromkeys stores the very same list object under every key
# (no copy is made), so an append for one person shows up under all of them and
# the expenses end up duplicated.

def formatExpenseDict(date, text, value):
    """Bundle one statement row into an expense record.

    Returns a dict with the keys 'date', 'text' and 'value', holding the
    arguments unchanged.
    """
    expense = dict(date=date, text=text, value=value)
    return expense

# Scan all rows and group expenses by person.
# A row whose first cell mentions one of `people` opens that person's section;
# every following row whose first cell is a dd/mm/yyyy date is one of their expenses.
current_person = ''
for rx in range(sh.nrows):
    # Only text cells can hold a person header or a date string. Skipping the
    # rest up front also keeps the substring test below from raising TypeError
    # on numeric cells, whose value is not a string.
    if sh.cell_type(rowx=rx, colx=DATE_COLUMN) != XL_CELL_TEXT:
        continue

    first_cell = sh.cell_value(rowx=rx, colx=DATE_COLUMN)

    # Use the matched name itself as the key, so it is always one of the keys of
    # `expenses` regardless of what else the header cell contains.
    section_owner = next((person for person in people if person in first_cell), None)
    if section_owner is not None:
        current_person = section_owner
        continue

    # Rows that appear before the first person header belong to nobody
    if current_person == '':
        continue

    if date_regex.search(first_cell) is not None:
        # group expenses when row is valid
        date_cell = first_cell
        text_cell = sh.cell_value(rowx=rx, colx=TEXT_COLUMN)
        value_cell = sh.cell_value(rowx=rx, colx=VALUE_COLUMN)

        expense = formatExpenseDict(date_cell, text_cell, value_cell)
        expenses[current_person].append(expense)
        # print(current_person, expense)

In [39]:
from datetime import date

# REMOVING EXPENSES BEFORE SELECTED DATE

# Cut-off written as dd/mm/yyyy (the format transform_date parses): expenses dated
# earlier than this are dropped, expenses on this exact date are kept.
IGNORE_EXPENSES_BEFORE_DATE = '12/08/2022'

def transform_date(str_date = '31/12/2021'):
    """Convert a 'dd/mm/yyyy' string into a datetime.date.

    Raises ValueError when the string does not split into exactly three
    integer parts or when they do not form a valid calendar date.
    """
    pieces = [int(piece) for piece in str_date.split('/')]
    day, month, year = pieces
    return date(year, month, day)

last_date = transform_date(IGNORE_EXPENSES_BEFORE_DATE)

# Keep only the expenses dated on or after the cut-off.
# Slice assignment rewrites each person's list in place, so `expenses` keeps
# pointing at the same list objects and no index bookkeeping is needed.
for person, expenses_list in expenses.items():
    expenses_list[:] = [
        entry for entry in expenses_list
        if not (transform_date(entry['date']) < last_date)
    ]

In [40]:
# CLEAR REFUNDS DISGUISED AS EXPENSES
# A refund is an entry with a negative value. It cancels the nearest earlier
# expense of the same amount, and both entries are dropped.
#
# Positions are only marked during the scan and the list is rebuilt afterwards.
# Removing items from the list while enumerating it shifts the remaining items,
# so the entries right after a cancelled pair would never be inspected.
for person, expenses_list in expenses.items():
    cancelled = set()
    for idx, expense in enumerate(expenses_list):
        if not (expense['value'] < 0):
            continue

        # searches refunded expense retroactively, expecting it to be near and above
        for jdx in reversed(range(idx)):
            if jdx in cancelled:
                # already paired with another refund
                continue
            # Compare against the negated refund so that only a positive expense
            # can match; two refunds of the same amount must not cancel each other.
            if expenses_list[jdx]['value'] == -expense['value']:
                cancelled.update((idx, jdx))
                break

    # Drop by position rather than by equality, so look-alike entries are untouched
    expenses_list[:] = [
        entry for position, entry in enumerate(expenses_list)
        if position not in cancelled
    ]

In [41]:
investiments = ['monis']

# REMOVE INVESTIMENTS FROM EXPENSES
# An expense is treated as an investment when its description contains any of
# the lowercase markers listed above.
#
# Each list is rebuilt in a single pass instead of popping while enumerating:
# a pop shifts the remaining items left, so the entry right after a removed one
# was never checked and back-to-back investments survived.
for person, expenses_list in expenses.items():
    expenses_list[:] = [
        expense for expense in expenses_list
        if not any(investiment in expense['text'].lower() for investiment in investiments)
    ]

In [42]:
# Show what is left after the clean-up steps, numbered per person
for person in expenses:
    for idx, expense in enumerate(expenses[person]):
        print(idx, person, expense)

0 VITOR HENRIQUE DE MORAES E {'date': '10/01/2023', 'text': 'Shop Cao Del-ct Y 02/02', 'value': 55.9}
1 VITOR HENRIQUE DE MORAES E {'date': '11/01/2023', 'text': 'Mk Moda Infa-ct  L02/04', 'value': 99.69}
2 VITOR HENRIQUE DE MORAES E {'date': '11/01/2023', 'text': 'Bm Import At-ct O 02/02', 'value': 125.5}
3 VITOR HENRIQUE DE MORAES E {'date': '21/01/2023', 'text': 'Decathlon Vi-ct El02/04', 'value': 101.25}
4 VITOR HENRIQUE DE MORAES E {'date': '30/01/2023', 'text': 'Vila Velha Boulevard S', 'value': 99.9}
5 VITOR HENRIQUE DE MORAES E {'date': '30/01/2023', 'text': 'Extrabom Boulevard', 'value': 156.43}
6 VITOR HENRIQUE DE MORAES E {'date': '30/01/2023', 'text': 'Precolandia -ct', 'value': 79.6}
7 VITOR HENRIQUE DE MORAES E {'date': '31/01/2023', 'text': 'Anuidade Diferenci06/12', 'value': 54.16}
8 VITOR HENRIQUE DE MORAES E {'date': '03/02/2023', 'text': 'Tribbu Fit F-ct', 'value': 28.0}
9 VITOR HENRIQUE DE MORAES E {'date': '03/02/2023', 'text': 'Pag*88beertr-ct', 'value': 69.5}
10 

## Convert to CSV, for Notion

Now that expenses are parsed, the last step involves:
- Translating texts for a more semantic representation
- Assigning the remaining CSV fields, such as categories and transfers
- Then joining the expenses to generate the csv file

In [43]:
# Internal category key -> label written to the 'Categoria' CSV column
categories = dict(
    transporte='Transporte',
    viagens='Viagens',
    pessoal='Pessoal',
    alimentacao='Alimentação',
    lazer='Lazer',
    casa='Casa',
    saude='Saúde',
    juju='Juju',
    streaming='Streaming',
    lanches='Lanches',
    bar='Bar/Bebida',
)

# (lowercase substring of the statement text, readable item name, category key)
# Row order is significant: get_translate_item returns the first entry whose
# substring matches, so 'rnfastfood' has to stay ahead of the shorter 'rnfastfo'.
_TRANSLATION_ROWS = [
    ('99*99', '99 Pay', 'casa'),
    ('vivo', 'Vivo', 'pessoal'),
    ('tbra', 'Vivo', 'pessoal'),
    ('uber', 'Uber', 'transporte'),
    ('posto ilha', 'Gasolina', 'transporte'),
    ('shellbox', 'Gasolina', 'transporte'),
    ('esfiha', 'Esfiha', 'lanches'),
    ('bakery', 'Padaria', 'alimentacao'),
    ('cr comercial', 'Padaria', 'alimentacao'),
    ('pastel', 'Pastel', 'lanches'),
    ('ifood', 'Ifood', 'alimentacao'),
    ('carone', 'Supermercado', 'alimentacao'),
    ('extrabom', 'Supermercado', 'alimentacao'),
    ('supermercado-ct im', 'Supermercado', 'alimentacao'),
    ('embutidos lo', 'Queijo Feira', 'alimentacao'),
    ('40 sab', 'Sorvete', 'lanches'),
    ('clubew', 'Wine', 'lazer'),
    ('spotify', 'Spotify', 'streaming'),
    ('farma', 'Farmácia', 'saude'),
    ('drogaria', 'Farmácia', 'saude'),
    ('petz', 'Juju', 'juju'),
    ('wendel fialh', 'Material Construção', 'casa'),
    ('rnfastfood', 'Ifood', 'lanches'),
    ('rnfastfo', 'Churrasquinho Praça', 'lanches'),
    ('tribbu', 'Açaí', 'lanches'),
    ('beertr', 'Rota Beer', 'bar'),
]

# substring -> {'text': readable item name, 'category': category label}
translate_dict = {
    needle: {'text': label, 'category': categories[category_key]}
    for needle, label, category_key in _TRANSLATION_ROWS
}

In [44]:
def get_translate_item(item_text):
    """Look up the readable name and category for a statement description.

    Returns the first translate_dict value whose key occurs in the lowercased
    description. When nothing matches, returns the original text with an
    empty category.
    """
    for needle, translation in translate_dict.items():
        if needle in item_text.lower():
            return translation

    return {'text': item_text, 'category': ''}

def get_payer():
    """Return the name written to the 'Quem pagou' column (always the same person)."""
    payer_name = 'Vitor'
    return payer_name

def get_who_transfers(payer, buyer):
    """Return the value for the 'Quem transfere?' column.

    Currently always the empty string; both arguments are ignored.
    """
    # Disabled rule, kept for reference:
    # return 'Não' if (payer.lower() in buyer.lower()) else 'Sim'
    return ''

In [45]:
header = ['Mes', 'Pagamento', 'Item', 'Categoria', 'Quem pagou', 'Valor', 'Quem transfere?', 'Cartão']
data = []

# Build one CSV row per expense, in the same column order as `header`
for buyer, expenses_list in expenses.items():
    for expense in expenses_list:
        translated_item = get_translate_item(expense['text'])
        # The statement stores dd/mm/yyyy; the CSV receives mm/dd/yyyy
        pay_date = transform_date(expense['date']).strftime("%m/%d/%Y")
        payer = get_payer()
        who_transfers = get_who_transfers(payer, buyer)

        data.append([
            '01',                          # Mes (fixed value)
            pay_date,                      # Pagamento
            translated_item['text'],       # Item
            translated_item['category'],   # Categoria
            payer,                         # Quem pagou
            expense['value'],              # Valor
            who_transfers,                 # Quem transfere?
            'PDA',                         # Cartão (fixed value)
        ])

In [46]:
import csv

# Write the header line followed by one line per expense.
# newline='' leaves line-ending handling to the csv module.
with open('expenses.csv', 'w', encoding='UTF8', newline='') as csv_file:
    csv.writer(csv_file).writerows([header] + data)