In [None]:
import pandas as pd
from src.spreadsheet import SpreadSheet
from src.classifier import *

Constants for the spreadsheet.

In [2]:
# Identifier handed to SpreadSheet(...) to select the target spreadsheet.
SPREADSHEET_ID = "1NONAyZN-DU7VyRR4ystC8rDZ0aBQsEU65kTJiesV-c0"

# Field name -> column letter in the sheet. Letter "J" (between month and date)
# has no entry; NOTE(review): presumably an intentionally empty sheet column,
# matching the 'blank' column inserted before upload — confirm.
col_mappings = {
    "month": "I",
    "date": "K",
    "cost": "L",
    "description": "M",
    "category": "N",
}

Change this path for each update:

In [3]:
# Path of the new-transactions file passed to load_and_process_transactions().
TRANSACTIONS_PATH = "data/new-data.csv"

### ETL

Load and preprocess current spreadsheet and new transactions:

In [None]:
# Handle for the spreadsheet identified by SPREADSHEET_ID; reused later to read
# the latest date and to append the new rows.
sheet = SpreadSheet(SPREADSHEET_ID)
# Current sheet contents, loaded via the field -> column mapping. Later passed
# to embed_and_classify() together with the new transactions.
old_data = load_and_process_spreadsheet(sheet, col_mappings)
display(old_data)

In [None]:
# New transactions loaded from TRANSACTIONS_PATH. Later cells read its "Date"
# and "Name" columns.
new_data = load_and_process_transactions(TRANSACTIONS_PATH)
display(new_data)

Remove overlapping data that has already been added to the sheet:

In [None]:
# Most recent value in the sheet's date column; used as the cut-off below.
latest_date = sheet.get_latest_col_value(col_mappings.get("date"))

# Keep rows dated on or after the cut-off and rename "Name" -> "description" so
# the column matches the name passed to embed_and_classify(). Filtering and
# renaming are chained so a new frame is returned, instead of renaming a
# freshly filtered frame in place.
# NOTE(review): `>=` keeps rows dated exactly `latest_date`, so same-day
# transactions already in the sheet may be included again — confirm intended.
new_data = new_data[new_data["Date"] >= latest_date].rename(columns={"Name": "description"})
display(new_data)

### Compute embeddings and classify

In [None]:
# Cut-off forwarded to embed_and_classify().
# NOTE(review): presumably the minimum confidence below which an item is
# reported in `low_conf` — confirm against src.classifier.
threshold = 0.85

# Classify the new transactions against the existing sheet rows, using the
# "description" column as text and "category" as the label column.
new_data_labeled, low_conf = embed_and_classify(
    old_data,
    new_data,
    "description",
    "category",
    threshold,
)

In [9]:
# One reminder line per entry in `low_conf` (index -> item name) so those rows
# can be classified by hand before the sheet is updated.
low_conf_messages = [
    f"Item '{item_name}' at index {row_index} has low confidence and needs manual classification."
    for row_index, item_name in low_conf.items()
]
for message in low_conf_messages:
    print(message)

### Update spreadsheet

In [None]:
# Show the labelled rows for review before they are reshaped and appended.
display(new_data_labeled)

Reorder and format to match sheet.

In [None]:
# Column order of the rows sent to the sheet (before the blank column is added).
cols = ['month', 'Date', 'Amount', 'description', 'category']

# Add the month name derived from "Date" and select the columns in sheet order.
# `.dt.month_name()` is the vectorised pandas accessor: it needs no per-row
# lambda or `calendar` lookup and yields a missing value for unparseable/NaT
# dates instead of raising. `.assign(...)` returns a new frame, so the frame
# produced by the classifier is not modified in place.
new_data_labeled = (
    new_data_labeled
    .assign(month=pd.to_datetime(new_data_labeled["Date"]).dt.month_name())
    .loc[:, cols]
)

# Empty column at position 1: col_mappings places month in "I" and date in "K",
# so this fills the unmapped column between them and keeps the values aligned
# with the six-column range written in the next cell.
new_data_labeled.insert(1, 'blank', '')
display(new_data_labeled)

In [None]:
# Build the target range from col_mappings so the column letters are defined
# in one place. With the current mapping this yields "I184:N184", the same
# range as before. Indexing (rather than .get) fails loudly if a key is missing
# instead of silently producing a range such as "None184:...".
first_col = col_mappings["month"]
last_col = col_mappings["category"]

# NOTE(review): row 184 is carried over unchanged from the previously
# hard-coded range. Confirm how SpreadSheet.append_values interprets the row
# and whether it must be updated for each run.
target_row = 184

sheet.append_values(
    new_data_labeled.values.tolist(),
    f"{first_col}{target_row}:{last_col}{target_row}",
)