In [1]:
import fitz  # PyMuPDF
import pandas as pd
import re

In [2]:
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page in a PDF, joined by newlines.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text("text")`` output,
        in page order, separated by ``"\\n"``.
    """
    # Open the document as a context manager so its file handle is released
    # even if text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        # The join consumes the generator while the document is still open.
        return "\n".join(page.get_text("text") for page in doc)

In [3]:
def extract_transactions(text):
    """Pull transaction rows out of statement text with a regular expression.

    A row is recognised as, in order and separated by whitespace: a date
    written as two digits, three letters and four digits joined by hyphens;
    a type code of ``T`` or ``C``; a free-form description; an amount with
    two decimals; and a balance with two decimals optionally followed by
    letters.

    Args:
        text: The text to scan.

    Returns:
        A list of ``[date, type, description, amount, balance]`` lists, one
        per match, with every field kept as a string and the description
        stripped of surrounding whitespace. Empty when nothing matches.
    """
    row_regex = re.compile(
        r"(\d{2}-[A-Za-z]{3}-\d{4})\s+([TC])\s+(.*?)\s+([\d,]+\.\d{2})\s+([\d,]+\.\d{2}[A-Za-z]*)"
    )

    return [
        [when, kind, details.strip(), amt, bal]
        for when, kind, details, amt, bal in row_regex.findall(text)
    ]

In [4]:
def save_to_excel(transactions, output_path):
    """Write transaction rows to an Excel workbook and report the destination.

    Args:
        transactions: Rows of five values each, in the order Date, Type,
            Description, Amount, Balance.
        output_path: Where the workbook is written.
    """
    header = ["Date", "Type", "Description", "Amount", "Balance"]
    table = pd.DataFrame(transactions, columns=header)
    # index=False keeps the DataFrame's row index out of the sheet.
    table.to_excel(output_path, index=False)
    print(f"Transactions saved to {output_path}")

In [5]:
def process_pdf_to_excel(pdf_path, output_path):
    """Run the PDF -> transactions -> Excel pipeline for one file.

    Args:
        pdf_path: PDF to read, handed to ``extract_text_from_pdf``.
        output_path: Workbook destination, handed to ``save_to_excel``.

    Nothing is written when no transactions are extracted; a warning is
    printed instead.
    """
    rows = extract_transactions(extract_text_from_pdf(pdf_path))

    # Guard clause: with no rows there is nothing to save.
    if not rows:
        print("⚠️ No transactions found. Check regex or PDF format.")
        return

    save_to_excel(rows, output_path)

In [6]:
# Input PDF and destination workbook for this run.
pdf_path = "test3.pdf"
output_path = "output.xlsx"

# Extract the transactions and write them out.
process_pdf_to_excel(pdf_path=pdf_path, output_path=output_path)

Transactions saved to output.xlsx
