In [1]:
from dotenv import load_dotenv
import os
from langchain_groq import ChatGroq
import pandas as pd
import re

In [2]:
# Load the variables from the local "secret.env" file (python-dotenv), then
# read the Groq API key from the process environment (None if it is not set).
load_dotenv("secret.env")
api_key = os.environ.get("GROQ_API_KEY")

In [3]:
# Initialize the Groq-hosted chat model used to categorize expenses.
# temperature=0 keeps sampling as deterministic as the API allows: this is a
# classification task whose replies are parsed with a strict regex later in the
# notebook, so run-to-run variation only hurts reproducibility.
# NOTE(review): confirm this model id is still served by Groq; older Llama 3.1
# ids have been retired in favour of newer releases.
llm = ChatGroq(
    model="llama-3.1-70b-versatile",
    groq_api_key=api_key,
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2
)

In [4]:
# Prompt Engineering

from langchain_core.prompts import PromptTemplate

# Raw instruction text for the categorization prompt. `{expenses}` is the only
# placeholder and is filled in when the prompt is invoked.
CATEGORIZATION_TEMPLATE = """
    You are a financial assistant. Your task is to categorize the following list of expenses into appropriate categories. 
    
    Instructions:
    1. Use the expenses provided.
    2. Categorize each expense based on the most relevant, commonly used categories. Assume that the expense name (such as a coffee shop or a retail store) reflects the nature of the transaction, even if it's not explicitly labeled.
       Commonly used categories:
        - Housing and Utilities: Include all the Housing and utilities such as rent, water bill, etc.
        - Transportation: Include all transportation-related expenses such as metro, taxis, etc.
        - Vehicle Maintenance: Include all car-related expenses such as car repairs, car washes, gas etc.
        - Food and Drinks: Combine any dining, coffee, restaurants, or snack-related expenses.
        - Groceries: Include supermarket and grocery store purchases.
        - Health and Wellness: Include expenses related to healthcare, pharmacies, gym memberships, etc.
        - Personal Care: Combine any beauty or personal grooming services like salons, spas, and barbershops.
        - Shopping: Combine all retail or online shopping expenses, including clothing, electronics, and household goods.
        - Entertainment: Include expenses for movies, streaming services, concerts, and any leisure activities.
        - Repairs and Maintenance: Include expenses related to any maintenance and repairs except the car such as locksmith, broken TV repair, etc.
        - Subscriptions and Fees: Include all the fees and subscription charges.
    3. If the category is unclear from the name, use your best judgment to place the expense into the most suitable existing category. If nothing is suitable then put the expense under Miscellaneous.
    4. Include the full details of each expense (date, description, and amount) in your categorization.
    
    Format your response like this:
    Category Name:
    - Date - Description - Amount

    Expenses to categorize:
    {expenses}

    """

# Build the prompt object from the template text above.
prompt_categorization = PromptTemplate.from_template(CATEGORIZATION_TEMPLATE)


In [5]:
# Load the monthly expense records from CSV and preview the first rows
csv_path = "monthly-expenses.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Date,Description,Amount (EUR)
0,2024-09-01,Rent,1200.0
1,2024-09-01,Electricity Bill,75.5
2,2024-09-01,Water Bill,30.0
3,2024-09-01,Internet Service,45.0
4,2024-09-01,Mobile Phone Plan,35.0


In [6]:
def format_expenses(expenses):
    """Render each expense row as one hyphen-separated line of text.

    Reads the ``Date``, ``Description`` and ``Amount (EUR)`` columns of every
    row and joins them into a single string so the LLM can interpret them
    more easily, e.g. ``'2024-09-01 - Rent - 1200.0 EUR'``.

    Args:
        expenses: DataFrame holding the three columns named above.

    Returns:
        A list with one formatted string per row, in row order.
    """
    return [
        f"{record['Date']} - {record['Description']} - {record['Amount (EUR)']} EUR"
        for _, record in expenses.iterrows()
    ]


In [7]:
def categorize_expenses(expense_list):
    """Ask the LLM to categorize one batch of formatted expenses.

    The formatted expense strings are joined into a single text separated by
    ', ' (e.g. ``2024-09-01 - Rent - 1200.0 EUR, 2024-09-01 - Electricity
    Bill - 75.5 EUR, ...``) and substituted into the categorization prompt,
    which is then piped into the LLM.

    Args:
        expense_list: List of formatted expense strings.

    Returns:
        The ``content`` attribute of the LLM response.
    """
    pipeline = prompt_categorization | llm
    llm_reply = pipeline.invoke({"expenses": ', '.join(expense_list)})
    return llm_reply.content


In [8]:
# Number of expense rows sent to the LLM per request (kept small because of
# the model's context limit)
chunk_size = 50
# Collects one LLM response per batch
categorized_expenses = []

In [9]:
# Walk over the expenses in batches of `chunk_size` rows, send each batch to
# the LLM and collect one response string per batch.
# The list is rebuilt from scratch in this cell rather than appended to a list
# created elsewhere, so re-running the cell does not duplicate batches.
categorized_expenses = [
    categorize_expenses(format_expenses(df.iloc[start:start + chunk_size]))
    for start in range(0, len(df), chunk_size)
]

print("....categorized_expenses (final list).....",categorized_expenses)
    

....categorized_expenses (final list)..... ['Housing and Utilities:\n- 2024-09-01 - Rent - 1200.0 EUR\n- 2024-09-01 - Electricity Bill - 75.5 EUR\n- 2024-09-01 - Water Bill - 30.0 EUR\n- 2024-09-01 - Internet Service - 45.0 EUR\n- 2024-09-01 - Home Insurance - 58.33 EUR\n\nTransportation:\n- 2024-09-02 - Metro Ticket - 2.8 EUR\n- 2024-09-03 - Metro Ticket - 2.8 EUR\n- 2024-09-04 - Metro Ticket - 2.8 EUR\n- 2024-09-05 - Metro Ticket - 2.8 EUR\n- 2024-09-06 - Metro Ticket - 2.8 EUR\n- 2024-09-07 - Metro Ticket - 2.8 EUR\n- 2024-09-09 - Metro Ticket - 2.8 EUR\n- 2024-09-10 - Metro Ticket - 2.8 EUR\n- 2024-09-11 - Metro Ticket - 2.8 EUR\n\nVehicle Maintenance:\n- 2024-09-07 - Gas Station - 65.0 EUR\n- 2024-09-01 - Car Insurance - 83.33 EUR\n\nFood and Drinks:\n- 2024-09-02 - Café Noir - 3.5 EUR\n- 2024-09-02 - Work Cafeteria - 8.5 EUR\n- 2024-09-03 - Bakery Deluxe - 4.75 EUR\n- 2024-09-03 - Veggie Corner - 11.5 EUR\n- 2024-09-04 - Café Noir - 3.5 EUR\n- 2024-09-04 - Pasta Paradise - 13.75 

In [10]:
def extract_data(text):
    """Parse one LLM categorization response into expense tuples.

    The response is expected to be made of blocks separated by a blank line.
    The first line of each block is taken as the category header, and the
    remaining lines are expense lines of the form
    ``- YYYY-MM-DD - description - amount EUR``.

    Args:
        text: Raw response text for one batch.

    Returns:
        A list of ``(date, description, category, amount)`` tuples with
        ``amount`` converted to ``float``. Lines that do not match the
        expected expense format are skipped.
    """
    # Compiled once instead of being re-looked-up for every expense line.
    expense_pattern = re.compile(r"- (\d{4}-\d{2}-\d{2}) - (.*?) - ([\d.]+) EUR")
    entries = []
    for block in text.split("\n\n"):
        header, *expense_lines = block.split("\n")
        # Remove surrounding whitespace, Markdown bold markers and colons so
        # that "**Groceries:**" and "Groceries: " both become "Groceries".
        category = header.strip(" \t\r*:")
        for line in expense_lines:
            match = expense_pattern.match(line.strip())
            if match:
                date, desc, amount = match.groups()
                entries.append((date, desc, category, float(amount)))
    return entries


In [11]:
# Flatten the tuples parsed from every batch response into a single list
all_entries = [
    entry
    for response_text in categorized_expenses
    for entry in extract_data(response_text)
]

# Tabulate the parsed entries
df_expenses = pd.DataFrame(all_entries, columns=["Date", "Expense", "Category", "Amount"])
# Title-case the category names
df_expenses["Category"] = df_expenses["Category"].str.title()

















































In [12]:
# List the distinct category labels that came back from the LLM
category_column = df_expenses["Category"]
unique_categories = category_column.unique()
print(unique_categories)

['Housing And Utilities' 'Transportation' 'Vehicle Maintenance'
 'Food And Drinks' 'Groceries' 'Health And Wellness' 'Shopping'
 'Entertainment' 'Subscriptions And Fees' 'Miscellaneous' 'Personal Care'
 'Repairs And Maintenance']


In [13]:
def standardize_categories(Category):
    """Remove Markdown and punctuation decoration from a category label.

    Strips any mix of whitespace, ``*`` (Markdown bold) and ``:`` from both
    ends of the label, e.g. ``"**Shopping:** "`` becomes ``"Shopping"``.
    Characters inside the label are left untouched.

    Args:
        Category: Category label as a string.

    Returns:
        The label without leading/trailing decoration.
    """
    # str.strip treats its argument as a *set* of characters, so a single call
    # handles the markers in any order; the final strip() also drops any other
    # whitespace characters left at the ends.
    return Category.strip("*: \t\n\r\f\v").strip()


    

In [15]:
# Clean every label in the 'Category' column with standardize_categories,
# then show the resulting frame
cleaned_categories = df_expenses['Category'].apply(standardize_categories)
df_expenses['Category'] = cleaned_categories
print(df_expenses)
df_expenses.head()



           Date                       Expense               Category   Amount
0    2024-09-01                          Rent  Housing And Utilities  1200.00
1    2024-09-01              Electricity Bill  Housing And Utilities    75.50
2    2024-09-01                    Water Bill  Housing And Utilities    30.00
3    2024-09-01              Internet Service  Housing And Utilities    45.00
4    2024-09-01                Home Insurance  Housing And Utilities    58.33
..          ...                           ...                    ...      ...
240  2024-09-08                    Oil Change    Vehicle Maintenance    55.00
241  2024-09-13                 Car Detailing    Vehicle Maintenance    80.00
242  2024-09-18  Windshield Wiper Replacement    Vehicle Maintenance    25.00
243  2024-09-23           Battery Replacement    Vehicle Maintenance   120.00
244  2024-09-28         Brake Pad Replacement    Vehicle Maintenance   150.00

[245 rows x 4 columns]


Unnamed: 0,Date,Expense,Category,Amount
0,2024-09-01,Rent,Housing And Utilities,1200.0
1,2024-09-01,Electricity Bill,Housing And Utilities,75.5
2,2024-09-01,Water Bill,Housing And Utilities,30.0
3,2024-09-01,Internet Service,Housing And Utilities,45.0
4,2024-09-01,Home Insurance,Housing And Utilities,58.33
