## Imports

In [11]:
# Install the required packages first (everything imported below must be available):
# pip install langchain langchain-community langchain-openai openai pandas numpy pypdf python-dotenv
# from langchain import LLMChain, PromptTemplate
# from langchain.chat_models import ChatOpenAI
# from langchain_core.output_parsers import StrOutputParser
# from langchain_core.prompts import PromptTemplate
# from langchain_openai import OpenAI

from langchain.tools import tool
from langchain.chat_models import init_chat_model

from langchain_community.document_loaders import PyPDFLoader

import pandas as pd
import json
import re
import numpy as np


from langchain.tools import tool
from langchain.agents import create_agent
from dotenv import load_dotenv
import os

# Read configuration (including OPENAI_API_KEY) from a local .env file into
# the process environment. Keep that .env file out of version control.
load_dotenv()

# SECURITY: credentials must never be written into the notebook. The API key
# that used to be hard-coded on this line has been removed; treat it as
# compromised and revoke it in the OpenAI dashboard.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

### Parser Tools

In [12]:
@tool
def parse_pdf(file_path: str) -> str:
    """Load PDF and return raw text."""
    # The docstring above is intentionally left as-is: the @tool decorator
    # exposes it to the model as the tool description.
    pages = PyPDFLoader(file_path).load()
    # Concatenate the text of every loaded document, one per line break.
    return "\n".join(page.page_content for page in pages)

@tool
def parse_excel(files_path: str) -> str:
    """Load a bank-statement CSV export and return a JSON string with the account info and the transaction rows."""
    # NOTE: the docstring is kept to a single line on purpose -- the @tool
    # decorator exposes it to the model as the tool description.
    #
    # Args:
    #     files_path: path of the CSV export to read.
    # Returns:
    #     A JSON object serialised as a string with two keys:
    #       "account_info"  - the cleaned header block as newline-separated text
    #       "transactions"  - list of row objects (Date, Description,
    #                         Reference Number, Debit, Credit, Balance, Type)
    # Raises:
    #     Whatever pandas raises for a missing/unreadable file, and KeyError
    #     if the expected columns are absent.
    #
    # File layout assumed by the slicing below (confirm for other banks):
    #   rows 0-19 -> account information, row 20 -> column headers,
    #   rows 21+  -> transactions.
    header_row = 20

    # Read the file the caller asked for (previously a hard-coded path was
    # used and the argument was silently ignored).
    df = pd.read_csv(files_path, header=None)

    # Account information: keep only the first two columns of the header block.
    # presumably the export has 7 columns (0-6) -- verify for other formats.
    info_df = df.iloc[:header_row].drop(columns=[2, 3, 4, 5, 6])
    info_text = info_df.to_string(index=False, header=False)

    # Collapse runs of whitespace and drop blank lines.
    info_text = "\n".join(
        re.sub(r'\s+', ' ', line.strip())
        for line in info_text.splitlines()
        if line.strip()
    )

    # Transactions: everything below the header row, labelled with that row.
    transaction_df = df.iloc[header_row + 1:].copy()
    transaction_df.columns = [str(col).strip() for col in df.iloc[header_row].tolist()]
    transaction_df = transaction_df.drop(columns=["Value Date"])

    # Treat empty / whitespace-only amount cells as missing.
    for amount_col in ("Credit", "Debit"):
        transaction_df[amount_col] = transaction_df[amount_col].replace(
            r'^\s*$', np.nan, regex=True
        )

    transaction_df = transaction_df.rename(
        columns={"Txn Date": "Date", "Ref No./Cheque No.": "Reference Number"}
    )

    # A populated Credit cell wins over Debit; rows with neither are 'Unknown'.
    transaction_df["Type"] = np.where(
        transaction_df['Credit'].notna(), 'Income',
        np.where(transaction_df['Debit'].notna(), 'Expense', 'Unknown')
    )

    # Round-trip through to_json so NaN becomes JSON null (json.dumps alone
    # would emit the non-standard token NaN).
    transactions = json.loads(transaction_df.to_json(orient="records"))

    # Return ONE string, as the signature promises, instead of a tuple.
    return json.dumps({"account_info": info_text, "transactions": transactions})

### Agent

In [24]:
# JSON schema for a single categorized transaction.
# NOTE(review): "category" is a free-form string; the allowed values are only
# listed in its description, so nothing here rejects other labels.
transaction_item_schema = {
    "type": "object",
    "properties": {
        "date": {"type": "string", "description": "Transaction date."},
        "description": {"type": "string", "description": "Narration or details."},
        "amount": {"type": "number", "description": "Positive for income, negative for expense."},
        "type": {
            "type": "string",
            "enum": ["income", "expense"],
            "description": "Transaction type."
        },
        "category": {
            "type": "string",
            "description": (
                "For income: one of ['salary', 'investment redemption', 'dividend', 'interest', 'other']; "
                "for expense: one of ['groceries', 'rent', 'food', 'EMI', 'investment', 'utilities', 'travel', "
                "'entertainment', 'medical', 'other']."
            )
        }
    },
    "required": ["date", "description", "amount", "type", "category"]
}

# Top-level response schema handed to the agent: a free-text overview, the
# list of categorized transactions, and free-text insights (all required).
categorize_schema = {
    "type": "object",
    "description": "Structured output containing categorized bank transactions.",
    "properties": {
        "overview": {
            "type": "string",
            "description": (
                "Basic details of the statement file, including the account holder's name, "
                "account number, address, opening balance, and closing balance, etc. along with a brief overview of the account holder."
            )
        },
        "transactions": {
            "type": "array",
            "description": "List of categorized transactions.",
            "items": transaction_item_schema
        },
        "insights": {
            "type": "string",
            "description": (
                "Insights on spending patterns, savings rate, unusual transactions, and "
                "personalized financial recommendations for the account holder."
            )
        }
    },
    "required": ["overview", "transactions", "insights"]
}



# Statement to analyse. Forward slashes work on every platform and avoid the
# invalid "\s" escape sequence the backslash form relied on.
statement_path = "statements/statement 2.pdf"

# Agent that picks the matching parser tool for the given file and returns
# its answer in the shape described by `categorize_schema`.
agent = create_agent(
    model="openai:gpt-5-nano",
    tools=[parse_excel, parse_pdf],
    system_prompt=(
        "You are a financial data assistant. You will be given either an Excel or PDF file path. Use the correct tool "
        "(parse_excel or parse_pdf) to extract transaction details. From the extracted data, identify all the transactions "
        "and output them in the specified response schema. Each transaction must include date, description, amount, type "
        "(“income” or “expense”), and a clear category. Use appropriate categories (salary, rent, groceries, utilities, etc.) "
        "and avoid using “other” unless absolutely necessary. Do not leave any transaction uncategorized or missing. Please do not leave out any transactions, give the categorised results for every transaction in the pdf."
    ),
    response_format=categorize_schema  # Auto-selects ProviderStrategy
)

# The user message is just the file path; the agent decides which tool to call.
result = agent.invoke({
    "messages": [{"role": "user", "content": statement_path}]
})

# Parsed response matching `categorize_schema`.
result["structured_response"]

{'overview': 'Account Holder: Mr GOWTHAM RAMASAMY; Account No: 1259155000155390; Address: NO. 1111D/4, GR NAGAR, 7TH CROSS, NORTH GANDHIGRAMAM, SENAPIRATTI, PASUPATHIPALAYAM, KARUR, KARUR, TAMIL NADU 639004. Statement Period: 01/06/2025 to 30/06/2025. Opening Balance: 13,076.17. Total Credits: 1,770.00. Total Debits: 12,143.00. Closing Balance: 2,703.17. Transactions in the period: Debit (expense) and Credit (income) entries totaling 9 credits and 77 debits observed in the statement. The enclosed transactions include discretionary transfers to individuals, gaming and online services, education-related payments, and a number of smaller merchant payments, with several recurring transfers to IIST/HDFC, Vikraman Nair, Uday Bharath, and related entities. A few credits include Kalaivani P (income) and Akshay Krishnan (income) along with a Credit Interest of 22.00. Insights below summarize spending patterns and recommendations based on observed categories and counterparties.',
 'transactions'

In [25]:
# Dump the raw agent state (the full message list, including tool calls and
# token-usage metadata) for debugging.
# NOTE(review): this output is very large and echoes the statement contents;
# clear it before sharing the notebook.
print(result)

{'messages': [HumanMessage(content='statements\\statement 2.pdf', additional_kwargs={}, response_metadata={}, id='bbe348b7-407e-4e77-9604-f265fd8d28a2'), AIMessage(content='', additional_kwargs={'parsed': None, 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 222, 'prompt_tokens': 552, 'total_tokens': 774, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 192, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-5-nano-2025-08-07', 'system_fingerprint': None, 'id': 'chatcmpl-CZxVXf3XK7nDawvbEG4h9L33BoWEd', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='lc_run--b17d711a-dd95-48dd-a41f-7b76b5f214ca-0', tool_calls=[{'name': 'parse_pdf', 'args': {'file_path': 'statements\\statement 2.pdf'}, 'id': 'call_RXVxygiAQrv8zA3VCJLiOANO', 'type': 'tool_call'}], usage_metadata={'input_tokens': 552,

In [26]:
# List every transaction the agent returned, preceded by the total count.
structured_response = result["structured_response"]
head_results = structured_response["transactions"]
print(f"Total number of analysed transactions: {len(head_results)}")
for each_result in head_results:
    print(each_result)

Total number of analysed transactions: 83
{'date': '2025-06-01', 'description': 'RUMMYCIRCLE', 'amount': -25.0, 'type': 'expense', 'category': 'entertainment'}
{'date': '2025-06-01', 'description': 'II ST CANTEEN ACCO', 'amount': -352.0, 'type': 'expense', 'category': 'food'}
{'date': '2025-06-01', 'description': 'PLAY GAMES 24X7 PVT', 'amount': -100.0, 'type': 'expense', 'category': 'entertainment'}
{'date': '2025-06-02', 'description': 'INDIAN INSTITUTE OF', 'amount': -300.0, 'type': 'expense', 'category': 'education'}
{'date': '2025-06-02', 'description': 'VIKRAMAN NAIR K-FDRL', 'amount': -40.0, 'type': 'expense', 'category': 'other'}
{'date': '2025-06-03', 'description': 'VIKRAMAN NAIR K-FDRL', 'amount': -10.0, 'type': 'expense', 'category': 'other'}
{'date': '2025-06-04', 'description': 'Google India Digital', 'amount': -33.0, 'type': 'expense', 'category': 'entertainment'}
{'date': '2025-06-04', 'description': 'VIKRAMAN NAIR K-FDRL', 'amount': -18.0, 'type': 'expense', 'category'

In [27]:
# Show the two free-text sections of the structured response.
for label, field in (("Overview:", "overview"), ("Insights:", "insights")):
    print(label, result["structured_response"][field])

Overview: Account Holder: Mr GOWTHAM RAMASAMY; Account No: 1259155000155390; Address: NO. 1111D/4, GR NAGAR, 7TH CROSS, NORTH GANDHIGRAMAM, SENAPIRATTI, PASUPATHIPALAYAM, KARUR, KARUR, TAMIL NADU 639004. Statement Period: 01/06/2025 to 30/06/2025. Opening Balance: 13,076.17. Total Credits: 1,770.00. Total Debits: 12,143.00. Closing Balance: 2,703.17. Transactions in the period: Debit (expense) and Credit (income) entries totaling 9 credits and 77 debits observed in the statement. The enclosed transactions include discretionary transfers to individuals, gaming and online services, education-related payments, and a number of smaller merchant payments, with several recurring transfers to IIST/HDFC, Vikraman Nair, Uday Bharath, and related entities. A few credits include Kalaivani P (income) and Akshay Krishnan (income) along with a Credit Interest of 22.00. Insights below summarize spending patterns and recommendations based on observed categories and counterparties.
Insights: Key spendin

In [28]:
# Tabulate the categorized transactions, parse the date strings, and add a
# readable month label (e.g., "October 2025") used for grouping later on.
out_df = (
    pd.DataFrame(result["structured_response"]["transactions"])
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .assign(month=lambda frame: frame["date"].dt.strftime("%B %Y"))
)

out_df

Unnamed: 0,date,description,amount,type,category,month
0,2025-06-01,RUMMYCIRCLE,-25.0,expense,entertainment,June 2025
1,2025-06-01,II ST CANTEEN ACCO,-352.0,expense,food,June 2025
2,2025-06-01,PLAY GAMES 24X7 PVT,-100.0,expense,entertainment,June 2025
3,2025-06-02,INDIAN INSTITUTE OF,-300.0,expense,education,June 2025
4,2025-06-02,VIKRAMAN NAIR K-FDRL,-40.0,expense,other,June 2025
...,...,...,...,...,...,...
78,2025-06-30,IIST-HDFC-5020002786,-500.0,expense,education,June 2025
79,2025-06-30,VIKRAMAN NAIR K-FDRL,-23.0,expense,other,June 2025
80,2025-06-30,BINU D-YESB-00226110,-308.0,expense,shopping,June 2025
81,2025-06-30,DEVAYANI V S-IPOS-02,74.0,income,other,June 2025


#### Monthly summary of income, expenses and savings

In [29]:
# Sum the signed amounts per month and transaction type, then pivot the
# types ("expense" / "income") into columns.
monthly_summary = (
    out_df.groupby(["month", "type"])["amount"]
      .sum()
      .unstack(fill_value=0)
      .reset_index()
      .rename_axis(None, axis=1)
)

# A statement may contain only one transaction type; make sure both columns
# exist so the arithmetic and later lookups never hit a missing column.
for flow in ("expense", "income"):
    if flow not in monthly_summary.columns:
        monthly_summary[flow] = 0

# "month" is a text label ("June 2025"), so the groupby orders it
# alphabetically. Re-sort chronologically by parsing the label back to a date.
monthly_summary = (
    monthly_summary
      .sort_values("month", key=lambda labels: pd.to_datetime(labels, format="%B %Y"))
      .reset_index(drop=True)
)

# Expenses are stored as negative amounts, so savings is a plain sum.
monthly_summary["savings"] = monthly_summary["income"] + monthly_summary["expense"]

monthly_summary



Unnamed: 0,month,expense,income,savings
0,June 2025,-10528.0,1236.0,-9292.0


In [30]:
# Grand totals across all months. reindex() supplies a zero column for any
# of the three that is absent (e.g. a statement with no income rows), so the
# lookups below cannot raise KeyError.
totals = (
    monthly_summary
      .reindex(columns=["income", "expense", "savings"], fill_value=0)
      .sum()
)
total_income = totals["income"]
total_expense = totals["expense"]
total_savings = totals["savings"]


print("Total Income:", total_income)
print("Total Expense:", total_expense)
print("Total Savings:", total_savings)

Total Income: 1236.0
Total Expense: -10528.0
Total Savings: -9292.0


#### Category Summary

In [31]:
# Total signed amount for every (category, type) pair ...
category_totals = out_df.groupby(["category", "type"])["amount"].sum()

# ... pivoted so each transaction type becomes its own column (0 where a
# category has no rows of that type), with a flat index and unnamed columns.
category_wide = category_totals.unstack(fill_value=0)
category_summary = category_wide.reset_index().rename_axis(None, axis=1)

category_summary

Unnamed: 0,category,expense,income
0,education,-4175.0,0.0
1,entertainment,-274.0,0.0
2,food,-352.0,0.0
3,interest,0.0,22.0
4,other,-3503.0,1014.0
5,salary,0.0,200.0
6,shopping,-2165.0,0.0
7,utilities,-59.0,0.0


## Experimenting

In [39]:
# Scratch cell: step-by-step version of the CSV parsing, printing the
# intermediate results for inspection.

# Forward slashes are portable and avoid the invalid "\o" escape sequence
# that the backslash form of this path relied on.
csv_path = "statements/oct2025_csv.csv"

# Layout taken from the slicing below: rows 0-19 hold the account
# information, row 20 the column headers, rows 21+ the transactions.
header_row = 20

df = pd.read_csv(csv_path, header=None)

# Account information block: keep only the first two columns.
info_df = df.iloc[:header_row]
info_df = info_df.drop(columns=[2, 3, 4, 5, 6])
info_text = info_df.to_string(index=False, header=False)

# Clean the text:
info_text = "\n".join(
    re.sub(r'\s+', ' ', line.strip())   # collapse multiple spaces → single space
    for line in info_text.splitlines()
    if line.strip()                     # skip blank lines
)
print(info_text)

# Transaction rows, labelled with the header row's (stripped) cell values.
transaction_df = df.iloc[header_row + 1:].copy()
headers = [str(col).strip() for col in df.iloc[header_row].tolist()]
transaction_df.columns = headers

transaction_df = transaction_df.drop(columns=["Value Date"])

# Replace empty strings or whitespace-only cells with NaN
transaction_df['Credit'] = transaction_df['Credit'].replace(r'^\s*$', np.nan, regex=True)
transaction_df['Debit'] = transaction_df['Debit'].replace(r'^\s*$', np.nan, regex=True)

transaction_df = transaction_df.rename(columns={"Txn Date": "Date", "Ref No./Cheque No.": "Reference Number"})

# A populated Credit cell wins over Debit; rows with neither are 'Unknown'.
transaction_df["Type"] = np.where(
    transaction_df['Credit'].notna(), 'Income',
    np.where(transaction_df['Debit'].notna(), 'Expense', 'Unknown')
)

trans_json = transaction_df.to_json(orient="records")

print(trans_json)

Account Name : Mr. Rohit Patnaik
Address : C/O Sasi Bhusan Patnaik, 00, WRITER STRE
ET
Bishamakatak-765019
396:Rayagada
Date : 28-Oct-25
Account Number : _00000040381454973
Account Description: LOTUS SAVING BANK-ADHAR- NCHQ
Branch : BISSAMCUTTACK
Drawing Power : 0
Interest Rate(% p.a.): 2.5
MOD Balance : 0
CIF No. : _90805421606
IFS (Indian Financial System) Code : SBIN0012100
CKYCR Number : XXXXXXXXXX1294
MICR (Magnetic Ink Character Recognition) Code : _765002501
Nomination Registered : No
Balance on 1 Oct 2025 : 8,413.30
Start Date : 01-Oct-25
End Date : 31-Oct-25
[{"Date":"01-Oct-25","Description":"   TO TRANSFER-UPI\/DR\/527449242189\/RAJENDRA\/YESB\/q737733319\/lassi--","Reference Number":"TRANSFER TO 4897693162093","Debit":"60","Credit":null,"Balance":"8,353.30","Type":"Expense"},{"Date":"01-Oct-25","Description":"   TO TRANSFER-UPI\/DR\/527457715352\/SABINA B\/YESB\/q72025260@\/bapa--","Reference Number":"TRANSFER TO 4897693162093","Debit":"400","Credit":null,"Balance":"7,953.3

In [22]:
def parse_pd2(file_path: str) -> str:
    """Read the PDF at ``file_path`` and return its text as one string.

    Undecorated copy of the PDF parser, callable directly for experiments.
    The text of each loaded document is joined with newlines.
    """
    pages = PyPDFLoader(file_path).load()
    return "\n".join(page.page_content for page in pages)

In [23]:
# Extract the raw text of the sample statement and print it to inspect what
# the PDF loader produces.
# NOTE(review): the printed text is the complete statement, including the
# account holder's personal details; clear this output before sharing.
full_text_parsed = parse_pd2("statements/statement 2.pdf")
print(full_text_parsed)

Account Statement
Mr GOWTHAM RAMASAMY
Acc.No.
:
1259155000155390
NO. 1111D/4, GR NAGAR, 7TH CROSS
NORTH GANDHIGRAMAM, SENAPIRATTI
PASUPATHIPALAYAM, KARUR
KARUR
TAMIL NADU
639004
25083874
SB - RESIDENT
01/07/2025
01/06/2025 to 30/06/2025
919597343176
gowtham3ramasamy@gmail.com
:
:
:
:
:
:
Customer ID
Acc.Type
St.Date
St.Period
Mobile No.
Email Id
Karur Vysya Bank,Central Office,Erode Road,Karur-639002(Tamilnadu) www.kvb.co.in
 
Account Summary
+
-
=
Opening Balance
Total Credit Amount
Total Debit Amount
Closing Balance
Count of Cr. & Dr.
Transactions
13,076.17
1,770.00
12,143.00
2,703.17
CR:9/DR:77
Statement of A/c 1259155000155390 for the period 01/06/2025 to 30/06/2025
Txn
Date
Value
Date
Brn
Code
Particulars
Ref. No
Debit
Credit
Balance
01/06/2025
01/06/2025
B/F...
13,076.17
01/06/2025
01/06/2025
1259
UPI-DR-551809971302-RUMMYCIRCLE-YESB-002
971302
25.00
13,051.17
01/06/2025
01/06/2025
1259
UPI-DR-515229095218-I I S T CANTEEN ACCO
095218
352.00
12,699.17
01/06/2025
01/06/2025
1259
UP