## Imports

In [5]:
from langchain.tools import tool
from langchain.chat_models import init_chat_model

# For PDF loading
from langchain_community.document_loaders import PyPDFLoader

import pandas as pd
import json
import re
import numpy as np

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import OpenAI
from langchain.tools import tool
from dotenv import load_dotenv
import os

### Parser Tools

In [2]:
@tool
def parse_pdf(file_path: str) -> str:
    """Load PDF and return raw text."""
    # NOTE: the docstring above is intentionally left verbatim -- the @tool
    # decorator exposes it to the model as the tool description.
    #
    # file_path: location of the PDF to read.
    # Returns the text of every loaded page, in order, joined by newlines.
    pages = PyPDFLoader(file_path).load()
    page_texts = (page.page_content for page in pages)
    return "\n".join(page_texts)

@tool
def parse_excel(files_path: str) -> tuple[str, str]:
    """Load a bank-statement CSV and return account-info text plus a JSON-string of transaction rows.

    Args:
        files_path: Path of the statement CSV export to parse.

    Returns:
        A pair ``(info_text, trans_json)``. ``info_text`` is the statement's
        header section as whitespace-normalised lines; ``trans_json`` is a
        JSON array (one object per transaction) that includes a derived
        ``Type`` field of ``Income``, ``Expense`` or ``Unknown``.
    """
    # Statement layout (0-indexed rows): rows 0-19 are the account header
    # section, row 20 holds the transaction column names, rows 21+ are data.
    header_row = 20

    # Read the file the caller actually asked for.
    df = pd.read_csv(files_path, header=None)

    # Header section: drop columns 2-6 and render what is left as plain text.
    info_df = df.iloc[:header_row].drop(columns=[2, 3, 4, 5, 6])
    info_text = info_df.to_string(index=False, header=False)

    # Collapse runs of whitespace to one space and skip blank lines.
    info_text = "\n".join(
        re.sub(r'\s+', ' ', line.strip())
        for line in info_text.splitlines()
        if line.strip()
    )

    # Transaction table: promote the header row to column names.
    transaction_df = df.iloc[header_row + 1:].copy()
    transaction_df.columns = [str(col).strip() for col in df.iloc[header_row].tolist()]

    transaction_df = transaction_df.drop(columns=["Value Date"])

    # Treat empty or whitespace-only amount cells as missing so that
    # notna() below reflects whether an amount is really present.
    for amount_col in ("Credit", "Debit"):
        transaction_df[amount_col] = transaction_df[amount_col].replace(
            r'^\s*$', np.nan, regex=True
        )

    transaction_df = transaction_df.rename(
        columns={"Txn Date": "Date", "Ref No./Cheque No.": "Reference Number"}
    )

    # A credit amount wins over a debit amount; rows with neither are "Unknown".
    transaction_df["Type"] = np.where(
        transaction_df['Credit'].notna(), 'Income',
        np.where(transaction_df['Debit'].notna(), 'Expense', 'Unknown')
    )

    trans_json = transaction_df.to_json(orient="records")

    return info_text, trans_json

### Experimenting

In [39]:
# Scratch cell: walk through the statement-parsing steps on one export and
# print both the account-info text and the transactions JSON.

# A forward slash keeps the path portable and avoids the invalid "\o"
# escape sequence that a backslash separator produces in a normal string.
df = pd.read_csv("statements/oct2025_csv.csv", header=None)

# Rows 0-19 are the account header section; columns 2-6 are dropped.
info_df = df.iloc[:20].drop(columns=[2, 3, 4, 5, 6])
info_text = info_df.to_string(index=False, header=False)

# Clean the text:
info_text = "\n".join(
    re.sub(r'\s+', ' ', line.strip())   # collapse multiple spaces -> single space
    for line in info_text.splitlines()
    if line.strip()                     # skip blank lines
)
# You can then print it:
print(info_text)

# Row 20 holds the transaction column names; data starts from row 21 downward.
transaction_df = df.iloc[21:].copy()
headers = [str(col).strip() for col in df.iloc[20].tolist()]
transaction_df.columns = headers

transaction_df = transaction_df.drop(columns=["Value Date"])

# Replace empty strings or whitespace-only cells with NaN
transaction_df['Credit'] = transaction_df['Credit'].replace(r'^\s*$', np.nan, regex=True)
transaction_df['Debit'] = transaction_df['Debit'].replace(r'^\s*$', np.nan, regex=True)

transaction_df = transaction_df.rename(columns={"Txn Date": "Date", "Ref No./Cheque No.": "Reference Number"})

# A credit amount marks income, otherwise a debit amount marks an expense.
transaction_df["Type"] = np.where(
    transaction_df['Credit'].notna(), 'Income',
    np.where(transaction_df['Debit'].notna(), 'Expense', 'Unknown')
)

trans_json = transaction_df.to_json(orient="records")

print(trans_json)

Account Name : Mr. Rohit Patnaik
Address : C/O Sasi Bhusan Patnaik, 00, WRITER STRE
ET
Bishamakatak-765019
396:Rayagada
Date : 28-Oct-25
Account Number : _00000040381454973
Account Description: LOTUS SAVING BANK-ADHAR- NCHQ
Branch : BISSAMCUTTACK
Drawing Power : 0
Interest Rate(% p.a.): 2.5
MOD Balance : 0
CIF No. : _90805421606
IFS (Indian Financial System) Code : SBIN0012100
CKYCR Number : XXXXXXXXXX1294
MICR (Magnetic Ink Character Recognition) Code : _765002501
Nomination Registered : No
Balance on 1 Oct 2025 : 8,413.30
Start Date : 01-Oct-25
End Date : 31-Oct-25
[{"Date":"01-Oct-25","Description":"   TO TRANSFER-UPI\/DR\/527449242189\/RAJENDRA\/YESB\/q737733319\/lassi--","Reference Number":"TRANSFER TO 4897693162093","Debit":"60","Credit":null,"Balance":"8,353.30","Type":"Expense"},{"Date":"01-Oct-25","Description":"   TO TRANSFER-UPI\/DR\/527457715352\/SABINA B\/YESB\/q72025260@\/bapa--","Reference Number":"TRANSFER TO 4897693162093","Debit":"400","Credit":null,"Balance":"7,953.3

### Agent

In [7]:
from langchain.agents import create_agent


# JSON schema for the agent's structured response, assembled from smaller
# named pieces so each level can be read on its own.

# Allowed category labels, spelled out for the model in the field description.
_category_field = {
    "type": "string",
    "description": (
        "For income: one of ['salary', 'investment redemption', 'dividend', 'interest', 'other']; "
        "for expense: one of ['groceries', 'rent', 'investment', 'utilities', 'travel', "
        "'entertainment', 'medical', 'other']."
    ),
}

# Direction of the transaction; restricted to two values.
_type_field = {
    "type": "string",
    "enum": ["income", "expense"],
    "description": "Transaction type.",
}

# One categorized transaction; every field is mandatory.
_transaction_item = {
    "type": "object",
    "properties": {
        "date": {"type": "string", "description": "Transaction date."},
        "description": {"type": "string", "description": "Narration or details."},
        "amount": {"type": "number", "description": "Positive for income, negative for expense."},
        "type": _type_field,
        "category": _category_field,
    },
    "required": ["date", "description", "amount", "type", "category"],
}

# Top-level object: a single required array of categorized transactions.
categorize_schema = {
    "type": "object",
    "description": "Structured output containing categorized bank transactions.",
    "properties": {
        "transactions": {
            "type": "array",
            "description": "List of categorized transactions.",
            "items": _transaction_item,
        }
    },
    "required": ["transactions"],
}

# Pull settings from a local .env file (if any) into the process environment.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    # Fail fast with an actionable message instead of a provider
    # authentication error surfacing later, during the agent call.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to the environment or to a .env file."
    )

# Agent with a single tool (parse_excel) whose final answer must conform to
# categorize_schema.
agent = create_agent(
    model="openai:gpt-5-nano",
    tools=[parse_excel],
    system_prompt=(
        "You are a financial assistant. You will be provided an excel_sheet path, "
        "use the parse_excel tool to extract the details and categorize the income "
        "and expenses to the relevant categories and respond according to the "
        "response schema for the first 10 entries of transactions. Ensure that none "
        "of the first 10 transactions are left uncategorized or unreported"
    ),
    response_format=categorize_schema  # Auto-selects ProviderStrategy
)

# The user message is just the statement path. A forward slash is used because
# "\o" is an invalid escape sequence in a normal string literal and a
# backslash separator is not portable.
result = agent.invoke({
    "messages": [{"role": "user", "content": "statements/oct2025_csv.csv"}]
})

# Show the schema-conforming payload as the cell output.
result["structured_response"]

{'transactions': [{'date': '01-Oct-25',
   'description': 'TO TRANSFER-UPI/DR/527449242189/RAJENDRA/YESB/q737733319/lassi--',
   'amount': -60,
   'type': 'expense',
   'category': 'groceries'},
  {'date': '01-Oct-25',
   'description': 'TO TRANSFER-UPI/DR/527457715352/SABINA B/YESB/q72025260@/bapa--',
   'amount': -400,
   'type': 'expense',
   'category': 'other'},
  {'date': '01-Oct-25',
   'description': 'BY TRANSFER-UPI/CR/377580263844/SHASHIBH/SBIN/patnaiksas/Payme--',
   'amount': 400,
   'type': 'income',
   'category': 'other'},
  {'date': '01-Oct-25',
   'description': 'TO TRANSFER-UPI/DR/564058445013/PRABHAKA/SBIN/7008867539/soup--',
   'amount': -30,
   'type': 'expense',
   'category': 'groceries'},
  {'date': '01-Oct-25',
   'description': 'TO TRANSFER-UPI/DR/527460090795/Mrs  DAK/YESB/q465147215/petti--',
   'amount': -30,
   'type': 'expense',
   'category': 'other'},
  {'date': '01-Oct-25',
   'description': 'TO TRANSFER-UPI/DR/527460702563/RAMAKANT/YESB/q655172926/han

In [8]:
# List the categorized transactions returned in the agent's structured
# response: a count first, then one transaction dict per line.
head_results = result["structured_response"]["transactions"]
print(f"Total number of analysed transactions: {len(head_results)}")
for each_result in head_results:
    print(each_result)

Total number of analysed transactions: 10
{'date': '01-Oct-25', 'description': 'TO TRANSFER-UPI/DR/527449242189/RAJENDRA/YESB/q737733319/lassi--', 'amount': -60, 'type': 'expense', 'category': 'groceries'}
{'date': '01-Oct-25', 'description': 'TO TRANSFER-UPI/DR/527457715352/SABINA B/YESB/q72025260@/bapa--', 'amount': -400, 'type': 'expense', 'category': 'other'}
{'date': '01-Oct-25', 'description': 'BY TRANSFER-UPI/CR/377580263844/SHASHIBH/SBIN/patnaiksas/Payme--', 'amount': 400, 'type': 'income', 'category': 'other'}
{'date': '01-Oct-25', 'description': 'TO TRANSFER-UPI/DR/564058445013/PRABHAKA/SBIN/7008867539/soup--', 'amount': -30, 'type': 'expense', 'category': 'groceries'}
{'date': '01-Oct-25', 'description': 'TO TRANSFER-UPI/DR/527460090795/Mrs  DAK/YESB/q465147215/petti--', 'amount': -30, 'type': 'expense', 'category': 'other'}
{'date': '01-Oct-25', 'description': 'TO TRANSFER-UPI/DR/527460702563/RAMAKANT/YESB/q655172926/hanki--', 'amount': -30, 'type': 'expense', 'category': '