In [9]:
import fitz  # PyMuPDF
import re
from datetime import datetime

def extract_text_lines(filepath):
    """Extract the text of every page of a PDF and return it as a list of lines.

    Args:
        filepath: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        list[str]: The concatenated text of all pages, split on ``"\\n"``.
        Lines are returned as-is (not stripped); empty strings appear
        wherever the extracted text contains blank lines.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises; the original left the document open.
    with fitz.open(filepath) as doc:
        # Join once instead of growing a string with repeated "+=".
        all_text = "".join(page.get_text() for page in doc)
    return all_text.split("\n")

def group_transaction_blocks(lines):
    """Group raw statement lines into one string per transaction.

    A new block starts at every line that begins with a date such as
    "07 Dec 20". All following lines are appended to that block until the
    next date-led line. Lines seen before the first date form a block of
    their own.

    Args:
        lines: Iterable of text lines (for example the output of
            ``extract_text_lines``).

    Returns:
        list[str]: One entry per block; the stripped lines of the block
        joined with single spaces. Empty input yields an empty list.
    """
    # Day, three-letter month abbreviation, two-digit year ("07 Dec 20").
    # The month is restricted to letters so digit runs such as "12 345 67"
    # are not mistaken for a date.
    date_start = re.compile(r"\d{2} [A-Za-z]{3} \d{2}")

    blocks = []
    current_block = []

    for line in lines:
        # Test the stripped text: it is what gets stored, and a date line
        # with leading whitespace must still open a new block.
        text = line.strip()
        if date_start.match(text) and current_block:
            blocks.append(" ".join(current_block))
            current_block = []
        current_block.append(text)

    # Flush the trailing block.
    if current_block:
        blocks.append(" ".join(current_block))

    return blocks

def parse_transaction_blocks(blocks):
    """Turn transaction text blocks into a list of transaction dicts.

    For each block the first date ("07 Dec 20") and the first monetary
    amount (two decimals, optional thousands separators) are extracted.
    Blocks lacking either one are skipped silently; blocks whose date or
    amount cannot be converted are skipped with a printed message.

    Args:
        blocks: Iterable of block strings (for example the output of
            ``group_transaction_blocks``).

    Returns:
        list[dict]: One dict per parsed block with the keys
        ``"date"`` (ISO ``YYYY-MM-DD`` string), ``"description"`` (first
        100 characters of the block), ``"amount"`` (float) and ``"type"``
        (``"Credit"`` when the block contains "RETURN" in any case,
        otherwise ``"Debit"``).
    """
    # Month restricted to letters so strptime is only fed plausible dates.
    date_pattern = re.compile(r"(\d{2} [A-Za-z]{3} \d{2})")
    # The lookbehind stops the match from starting in the middle of a number:
    # without it "1200.00" was read as "200.00". The integer part is either
    # comma-grouped ("1,200") or a plain digit run ("1200").
    amount_pattern = re.compile(r"(?<!\d)((?:\d{1,3}(?:,\d{3})+|\d+)\.\d{2})")

    transactions = []
    for block in blocks:
        date_match = date_pattern.search(block)
        amount_match = amount_pattern.search(block)
        if not (date_match and amount_match):
            continue

        try:
            date = datetime.strptime(date_match.group(1), "%d %b %y").strftime("%Y-%m-%d")
            amount = float(amount_match.group(1).replace(",", ""))
        except ValueError as e:
            # strptime/float raise ValueError on malformed input (e.g. "31 Feb 20").
            print(f"Skipping block due to error: {e}")
            continue

        # NOTE(review): the debit/credit decision relies solely on the word
        # "RETURN"; other credits (e.g. interest) come out as "Debit".
        tx_type = "Credit" if "RETURN" in block.upper() else "Debit"

        transactions.append({
            "date": date,
            "description": block[:100],  # first 100 chars of block
            "amount": amount,
            "type": tx_type
        })

    return transactions


In [10]:
# Location of the statement PDF to analyse (absolute Windows path).
file_path = r"D:\UPI_Analyzer\Data\eStatement.pdf"
# Pipeline: raw text lines -> date-delimited blocks -> transaction dicts.
lines = extract_text_lines(file_path)
blocks = group_transaction_blocks(lines)
txns = parse_transaction_blocks(blocks)

# Print each parsed transaction dict on its own line.
# NOTE(review): after this loop the name `txn` stays bound to the last
# element of `txns` (or is undefined when `txns` is empty).
for txn in txns:
    print(txn)


{'date': '2020-12-01', 'description': '01 Dec 20 01 Dec 20 BALANCE FORWARD 264.19   UPI/033622092936/ 027601517870/ANBARASUMAHI@OKICICI/ M ', 'amount': 264.19, 'type': 'Debit'}
{'date': '2020-12-02', 'description': '02 Dec 20 02 Dec 20 UPI/033715024645/ 1660155000036143/V.SUGANYABTECHIT@OKSBI/ SUGANYA V/KVBL0001660', 'amount': 1200.0, 'type': 'Credit'}
{'date': '2020-12-03', 'description': '03 Dec 20 02 Dec 20 IMPS P2A CHARGES 021220   3.55 0.64   CGST @ 9.00%   0.32 0.32   SGST @ 9.00%   ', 'amount': 3.55, 'type': 'Debit'}
{'date': '2020-12-07', 'description': '07 Dec 20 06 Dec 20 UPI/034118357655/ 027601517870/ANBARASUMAHI@OKICICI/ M ANBARASU/ICIC0000276/UPI/', 'amount': 2.0, 'type': 'Debit'}
{'date': '2020-12-07', 'description': '07 Dec 20 07 Dec 20 UPI/034201041227/ 917020028084740/GOOG-PAYMENT@OKAXIS/ GOOGLEPAY/UTIB0000553/UPI', 'amount': 6.0, 'type': 'Debit'}
{'date': '2020-12-07', 'description': '07 Dec 20 07 Dec 20 BALANCE FORWARD 6.29', 'amount': 6.29, 'type': 'Debit'}
{'date'

In [11]:
# Strip the date prefix(es) from the description of EVERY transaction.
# The original statements ran outside any loop, so they only touched the
# single dict that the name `txn` happened to be bound to.
for txn in txns:
    txn["description"] = re.sub(r'\d{2} \w{3} \d{2}', '', txn["description"])  # remove date from description
    txn["description"] = txn["description"].strip()

In [12]:
def categorize_transaction(description):
    """Map a transaction description to a spending category.

    Matching is case-insensitive and substring based. Rules are tried in
    order and the first hit wins. Merchant/purpose rules are checked before
    the generic payment-channel rules ("upi", "imps", ...), so a UPI payment
    to Amazon is "Shopping" rather than "UPI Payment".

    Args:
        description: Free-text transaction description. ``None`` or an
            empty string is treated as having no keywords.

    Returns:
        str: One of "Food & Dining", "Utilities", "Shopping",
        "UPI Payment", "Bank Transfer", "Income" or "Others".
    """
    desc = (description or "").lower()

    # (keywords, category) pairs in priority order.
    rules = (
        (("zomato", "swiggy"), "Food & Dining"),
        (("recharge", "electricity"), "Utilities"),
        (("amazon", "flipkart"), "Shopping"),
        # Generic channel keywords come last among the specific rules;
        # previously "upi" shadowed the Utilities and Shopping rules.
        (("googlepay", "paytm", "upi"), "UPI Payment"),
        (("imps", "transfer"), "Bank Transfer"),
        (("interest",), "Income"),
    )
    for keywords, category in rules:
        if any(keyword in desc for keyword in keywords):
            return category
    return "Others"


In [17]:
def get_llm_insight(transactions_summary):
    """Ask the chat model for savings insights on a transaction summary.

    The summary is interpolated into a fixed prompt, sent as a single user
    message to the "gpt-3.5-turbo" model, and the content of the first
    returned choice is handed back.

    NOTE(review): relies on the names ``openai`` and ``st`` being in scope;
    neither is imported in the cells visible here - confirm where they come
    from before running.

    Args:
        transactions_summary: Value rendered into the prompt via an f-string.

    Returns:
        The ``content`` field of the first choice's message.
    """
    # The API key is read from the secrets mapping under "OPENAI_API_KEY".
    openai.api_key = st.secrets["OPENAI_API_KEY"]
    prompt = f"""Analyze this transaction summary:\n{transactions_summary}\n
    Give monthly savings %, unnecessary spending, and 3 smart tips for better finance management."""
    messages = [{"role": "user", "content": prompt}]
    completion = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
    first_choice = completion['choices'][0]
    return first_choice['message']['content']

In [None]:
# Tag every transaction dict with the category derived from its description.
for txn in txns:
    txn.update(category=categorize_transaction(txn["description"]))


{'date': '2020-12-31', 'description': 'SAVING A/C CREDIT INTEREST  11.00  4,649.29 TOTAL 14,834.00 10,448.90 4,649.29 R', 'amount': 11.0, 'type': 'Debit', 'category': 'Income'}


In [15]:
import pandas as pd

# Build a table from the transaction dicts and write it out without the index column.
df = pd.DataFrame(data=txns)
df.to_csv(path_or_buf="parsed_transactions.csv", index=False)
print("✅ Saved to parsed_transactions.csv")

✅ Saved to parsed_transactions.csv
