In [30]:
from openpyxl import load_workbook
import pandas as pd
import re
from typing import *

def load_xlsx(filename: str = 'data/Xi 2023-24 02_September.xlsx') -> pd.DataFrame:
    """Load the dated transaction rows from the active sheet of a ledger workbook.

    Sheet layout, as 0-based positions in the list of sheet rows:

    * position 5 holds the column headers (the second header is always
      replaced by ``'USD'``),
    * position 6 is skipped,
    * transaction rows start at position 7 and continue until the first row
      whose ``Date`` cell is empty or does not start with ``YYYY-MM-DD``.

    The workbook is opened with ``data_only=False``, so formula cells come
    back as their formula text rather than as computed values.

    Args:
        filename: Path of the ``.xlsx`` workbook to read.

    Returns:
        A DataFrame with one row per transaction. Columns whose name repeats
        an earlier column are removed (first occurrence wins). The index keeps
        the labels assigned before the skipped row was removed, so the first
        transaction is labelled ``1``. If every ``Date`` is valid, all rows are
        returned.

    Raises:
        IndexError: If the sheet has fewer than six rows, or the header row
            has fewer than two cells.
        KeyError: If the header row has no ``Date`` column.
    """
    book = load_workbook(filename, data_only=False)
    rows = list(book.active.values)

    col_names = list(rows[5])
    # The second column is exposed as 'USD' whatever its header cell contains.
    col_names[1] = 'USD'

    df = pd.DataFrame(rows[6:], columns=col_names)
    # Skip the first row under the header. Slicing (rather than dropping by
    # label) also copes with a sheet that has no rows below the header.
    df = df.iloc[1:]
    # De-duplicate before touching df['Date'] so that lookup is always a Series.
    df = df.loc[:, ~df.columns.duplicated()]

    date_pattern = re.compile(r'^\d{4}-\d{2}-\d{2}.*')
    # Positional cut-off: keep everything unless an invalid date is found.
    # (A boolean idxmax() would report the first row when nothing is invalid
    # and silently empty the result.)
    n_valid = len(df)
    for position, value in enumerate(df['Date']):
        if pd.isnull(value) or not date_pattern.match(str(value)):
            n_valid = position
            break

    return df.iloc[:n_valid]

def load_statement(filename: str = 'data/stmt october.xlsx', xlsx_df: Optional[pd.DataFrame] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load a bank statement workbook and split it into credits and debits.

    Rows from position 8 onward (0-based) of the active sheet are read,
    right-padded with ``None`` up to the number of columns in ``xlsx_df``, and
    labelled positionally with ``xlsx_df``'s column names, so the statement
    ends up with the same columns as the ledger.

    Args:
        filename: Path of the bank statement, converted to ``.xlsx``.
        xlsx_df: Ledger frame whose columns are used as the column names.
            When omitted, the ledger is loaded with ``load_xlsx()`` at call
            time. (A ``load_xlsx()`` default expression would run once, when
            the function is defined, and be shared by every call.)

    Returns:
        ``(credit_df, debit_df)``: rows with ``USD > 0`` and rows with
        ``USD <= 0`` respectively, so a zero amount counts as a debit. Rows
        whose ``USD`` is NaN after the float cast appear in neither frame.

    Raises:
        ValueError: If a ``USD`` cell cannot be converted to float. pandas
            also rejects rows that have more cells than ``xlsx_df`` has
            columns.
    """
    if xlsx_df is None:
        xlsx_df = load_xlsx()

    book = load_workbook(filename, data_only=False)
    # The bank statement sheet (converted to xlsx).
    sheet = book.active

    col_names = list(xlsx_df.columns)
    width = len(col_names)
    # Pad short rows with None so every row has one cell per ledger column.
    data_rows = [list(row) + [None] * (width - len(row)) for row in list(sheet.values)[8:]]
    df = pd.DataFrame(data_rows, columns=col_names)

    df['USD'] = df['USD'].astype(float)
    # Return independent copies so callers can add or modify columns without
    # pandas chained-assignment warnings.
    credit_df = df[df['USD'] > 0].copy()
    debit_df = df[df['USD'] <= 0].copy()
    return credit_df, debit_df



In [31]:
# Raise pandas' display limits so wide/long ledger frames render in full
# (set_option accepts several option/value pairs in one call).
pd.set_option(
    'display.max_rows', 1000,
    'display.max_columns', 1000,
)

# Load the ledger with its default workbook path and preview the first two rows.
xlsx_df = load_xlsx()
xlsx_df.head(2)

Unnamed: 0,Date,USD,foot,Category,rcpt,Description,None,data,electricity,gas,vonage,waste,water,food,sum food,acctg,TEP,AILG,IFC,exec,???,licenses,house,hvac,cleaning,rush,social,large,rent
1,2023-07-03 00:00:00,-65.28,=SUM(H8:AC8)-B8,sum food,,STAR MARKET 0602 06/29 PURCHASE BOSTON MA DEBI...,,"=IF(H$6=$D8,$B8,0)","=IF(I$6=$D8,$B8,0)","=IF(J$6=$D8,$B8,0)","=IF(K$6=$D8,$B8,0)","=IF(L$6=$D8,$B8,0)","=IF(M$6=$D8,$B8,0)","=IF(N$6=$D8,$B8,0)","=IF(O$6=$D8,$B8,0)","=IF(P$6=$D8,$B8,0)","=IF(Q$6=$D8,$B8,0)","=IF(R$6=$D8,$B8,0)","=IF(S$6=$D8,$B8,0)","=IF(T$6=$D8,$B8,0)","=IF(U$6=$D8,$B8,0)","=IF(V$6=$D8,$B8,0)","=IF(W$6=$D8,$B8,0)","=IF(X$6=$D8,$B8,0)","=IF(Y$6=$D8,$B8,0)","=IF(Z$6=$D8,$B8,0)","=IF(AA$6=$D8,$B8,0)","=IF(AB$6=$D8,$B8,0)","=IF(AC$6=$D8,$B8,0)"
2,2023-07-03 00:00:00,-565.64,=SUM(H9:AC9)-B9,sum food,,BJS.COM #5490 07/01 PURCHASE 800-257-2582 MA D...,,"=IF(H$6=$D9,$B9,0)","=IF(I$6=$D9,$B9,0)","=IF(J$6=$D9,$B9,0)","=IF(K$6=$D9,$B9,0)","=IF(L$6=$D9,$B9,0)","=IF(M$6=$D9,$B9,0)","=IF(N$6=$D9,$B9,0)","=IF(O$6=$D9,$B9,0)","=IF(P$6=$D9,$B9,0)","=IF(Q$6=$D9,$B9,0)","=IF(R$6=$D9,$B9,0)","=IF(S$6=$D9,$B9,0)","=IF(T$6=$D9,$B9,0)","=IF(U$6=$D9,$B9,0)","=IF(V$6=$D9,$B9,0)","=IF(W$6=$D9,$B9,0)","=IF(X$6=$D9,$B9,0)","=IF(Y$6=$D9,$B9,0)","=IF(Z$6=$D9,$B9,0)","=IF(AA$6=$D9,$B9,0)","=IF(AB$6=$D9,$B9,0)","=IF(AC$6=$D9,$B9,0)"


In [32]:
# Split the bank statement into credits (USD > 0) and debits (USD <= 0)
# using load_statement's default paths, then preview the first two credits.
credit_df, debit_df = load_statement()
credit_df.head(2)

Unnamed: 0,Date,USD,foot,Category,rcpt,Description,None,data,electricity,gas,vonage,waste,water,food,sum food,acctg,TEP,AILG,IFC,exec,???,licenses,house,hvac,cleaning,rush,social,large,rent
0,10/2/23,3500.0,,,,Zelle payment from EDEN SOLOMON for '23 Fall r...,,,,,,,,,,,,,,,,,,,,,,,
1,10/2/23,3500.0,,,,Zelle payment from WINNIE SZETO for rent part ...,,,,,,,,,,,,,,,,,,,,,,,


In [33]:
# Preview the first two debit rows (USD <= 0) of the statement.
debit_df.head(2)


Unnamed: 0,Date,USD,foot,Category,rcpt,Description,None,data,electricity,gas,vonage,waste,water,food,sum food,acctg,TEP,AILG,IFC,exec,???,licenses,house,hvac,cleaning,rush,social,large,rent
4,10/2/23,-43.05,,,,STAR MARKET 0602 09/28 PURCHASE BOSTON MA DEBI...,,,,,,,,,,,,,,,,,,,,,,,
5,10/2/23,-395.73,,,,BJS.COM #5490 10/01 PURCHASE 800-257-2582 MA D...,,,,,,,,,,,,,,,,,,,,,,,
