In [1]:
import os
import pdfplumber
import re
import pandas as pd

# --- Configuration ---
# Folder that is scanned for statement PDFs (relative path, resolved against
# the notebook's working directory).
invoice_folder = "Invoices"
# Excel workbook that receives the "Transaction Details" and
# "Summary Comparison" sheets.
output_file = "financial_summary.xlsx"
# Text file holding, one per line, the PDF filenames that were already processed.
processed_file_txt = "processed_files.txt"

In [2]:
# ------------------ Helper Functions ------------------

def ensure_pdf_extension(folder):
    """Append ``.pdf`` to files in ``folder`` that are PDFs but lack the extension.

    A file is treated as a PDF when its first five bytes are ``%PDF-``.
    The function is best-effort: a file that cannot be read or renamed is
    reported and skipped so the remaining files are still handled.

    Args:
        folder: Directory whose direct children are inspected (no recursion).
    """
    for filename in os.listdir(folder):
        file_path = os.path.join(folder, filename)
        if not os.path.isfile(file_path) or filename.lower().endswith(".pdf"):
            continue

        # Read the header and close the handle *before* renaming: renaming a
        # file that is still open fails on Windows.
        try:
            with open(file_path, "rb") as f:
                is_pdf = f.read(5) == b"%PDF-"
        except OSError as exc:
            print(f"Skipped '{filename}': could not read file ({exc})")
            continue

        if not is_pdf:
            continue

        new_path = file_path + ".pdf"
        # os.rename silently replaces an existing target on POSIX, so refuse
        # to clobber a file that already carries the target name.
        if os.path.exists(new_path):
            print(f"Skipped '{filename}': '{filename}.pdf' already exists")
            continue

        try:
            os.rename(file_path, new_path)
        except OSError as exc:
            print(f"Skipped '{filename}': could not rename ({exc})")
            continue

        print(f"Renamed '{filename}' → '{filename}.pdf'")

def rename_pdfs_to_standard(folder):
    """Rename PDFs whose name contains a week number to ``HF_W<week>.pdf``.

    The week number is the first run of digits that follows a ``W``/``w`` in
    the filename. If the target name is taken, a numeric suffix is appended
    (``HF_W<week>_1.pdf``, ``HF_W<week>_2.pdf``, ...). PDFs without a week
    number are left untouched.

    Args:
        folder: Directory whose direct children are inspected (no recursion).
    """
    for filename in os.listdir(folder):
        file_path = os.path.join(folder, filename)
        if not os.path.isfile(file_path) or not filename.lower().endswith(".pdf"):
            continue

        # Skip files that are already in standard form. The optional "_<n>"
        # suffix must be accepted too: it is what this function itself produces
        # on a name collision, and without it those files would be renamed
        # again (to a different suffix) on every run.
        if re.match(r"HF_W\d+(_\d+)?\.pdf$", filename, re.IGNORECASE):
            continue

        # Extract week number
        week_match = re.search(r"W(\d+)", filename, re.IGNORECASE)
        if not week_match:
            continue

        week_num = week_match.group(1)
        new_filename = f"HF_W{week_num}.pdf"
        new_path = os.path.join(folder, new_filename)

        # Avoid overwriting: probe suffixed names until a free one is found.
        counter = 1
        while os.path.exists(new_path):
            new_filename = f"HF_W{week_num}_{counter}.pdf"
            new_path = os.path.join(folder, new_filename)
            counter += 1

        os.rename(file_path, new_path)
        print(f"Renamed '{filename}' → '{new_filename}'")

def extract_text(pattern, text):
    """Return capture group 1 of the first ``pattern`` match in ``text``, or None."""
    found = re.search(pattern, text)
    if found is None:
        return None
    return found.group(1)

def extract_money(amount_str):
    """Convert a money string such as ``-$1,234.56`` to a float.

    Dollar signs and thousands separators are removed before conversion.
    A falsy input (``None`` or ``""``) yields ``0.0``.
    """
    if amount_str:
        cleaned = amount_str
        for symbol in ("$", ","):
            cleaned = cleaned.replace(symbol, "")
        return float(cleaned)
    return 0.0

def get_week_from_filename(filename):
    """Return the digits following ``HF_W`` in ``filename`` (case-insensitive), or None."""
    found = re.search(r"HF_W(\d+)", filename.upper())
    return None if found is None else found.group(1)

def extract_details_section(text):
    """Return everything after the first ``DETAILS`` marker in ``text``.

    The match spans newlines; an empty string is returned when the marker
    is absent.
    """
    found = re.search(r"DETAILS(.*)", text, re.DOTALL)
    if found is None:
        return ""
    return found.group(1)

def parse_statement_long(text, week_from_filename=None):
    """Parse the DETAILS section of a statement into one record per money line.

    Only lines that end in a dollar amount (``$1,234.56`` or ``-$1,234.56``)
    are kept; blank lines and lines starting with ``Net Payment Total`` are
    skipped.

    Args:
        text: Full text of the statement.
        week_from_filename: Value stored in each record's ``"Week"`` field.

    Returns:
        A list of dicts with keys ``"Week"``, ``"Year"``, ``"Period Start"``,
        ``"Period End"``, ``"Category"`` and ``"Amount"``. ``"Year"`` and the
        period fields are ``None`` when the corresponding header is not found
        in ``text``.
    """
    year = extract_text(r"Accounting Year\s+(\d{4})", text)
    period_match = re.search(
        r"Payment Period\s+(\d{2}/\d{2}/\d{4})\s*-\s*(\d{2}/\d{2}/\d{4})",
        text
    )
    period_start = period_match.group(1) if period_match else None
    period_end = period_match.group(2) if period_match else None

    transactions = []
    for line in extract_details_section(text).split("\n"):
        line = line.strip()
        if not line or line.startswith("Net Payment Total"):
            continue

        money_match = re.search(r"(-?\$[\d,]+\.\d{2})$", line)
        if not money_match:
            continue

        amount = extract_money(money_match.group(1))
        # Cut the label where the trailing amount starts. Using
        # str.replace(amount, "") would also delete the same figure if it
        # happens to appear earlier in the label.
        line_clean = line[:money_match.start()].strip()

        # ---------- Categorise by label prefix ----------
        if line_clean.startswith("Gross Sales"):
            units_match = re.search(r"Gross Sales\s+(\d+)", line_clean)
            units = units_match.group(1) if units_match else "N/A"
            category = f"Gross Sales ({units} units)"
        elif line_clean.startswith("Net Sales"):
            category = "Net Sales"
        elif line_clean.startswith("Insurance"):
            category = "Insurance"
        elif line_clean.startswith("Credits"):
            category = "Credits"
        elif line_clean.startswith("Deductions"):
            # Drop only the leading "Deductions" marker; the rest of the label
            # (which may itself contain that word) is the category name.
            category = line_clean[len("Deductions"):].strip()
        else:
            category = line_clean

        transactions.append({
            "Week": week_from_filename,
            "Year": year,
            "Period Start": period_start,
            "Period End": period_end,
            "Category": category,
            "Amount": amount
        })

    return transactions

def parse_statement_summary(text, week_from_filename=None):
    """Collect the headline totals from a statement's SUMMARY section.

    The section is the text between ``SUMMARY`` and ``DETAILS`` (or the end of
    the text); when no ``SUMMARY`` marker exists the whole text is scanned.
    Lines ending in a dollar amount are matched by label prefix; any amount
    whose label matches none of the known prefixes is added to the deductions
    total.

    Args:
        text: Full text of the statement.
        week_from_filename: Value stored under the ``"Week"`` key.

    Returns:
        A dict with ``"Week"`` and the five ``"... (Statement)"`` totals.
    """
    section = re.search(r"SUMMARY(.*?)(DETAILS|$)", text, flags=re.DOTALL)
    body = section.group(1) if section else text

    # Known label prefixes, checked in this order; a later line with the same
    # prefix overwrites the earlier value.
    totals = {
        "Gross Sales": 0.0,
        "Net Sales": 0.0,
        "Credits": 0.0,
        "Net Payment Total": 0.0,
    }
    other_amounts = []

    for raw_line in body.split("\n"):
        entry = raw_line.strip()
        if entry == "" or entry.startswith("Entry Type"):
            continue

        parsed = re.search(r"([\w\s:%]+)\s+(-?\$[\d,]+\.\d{2})$", entry)
        if parsed is None:
            continue

        label = parsed.group(1).strip()
        value = float(parsed.group(2).replace("$", "").replace(",", ""))

        for prefix in totals:
            if label.startswith(prefix):
                totals[prefix] = value
                break
        else:
            # No known prefix matched: count the amount as a deduction.
            other_amounts.append(value)

    return {
        "Week": week_from_filename,
        "Gross Sales (Statement)": totals["Gross Sales"],
        "Net Sales (Statement)": totals["Net Sales"],
        "Credits (Statement)": totals["Credits"],
        "Deductions (Statement)": sum(other_amounts),
        "Net Payment Total (Statement)": totals["Net Payment Total"],
    }

In [3]:
# ------------------ Prep ------------------

# Normalise filenames first so the listing below sees the final names.
ensure_pdf_extension(invoice_folder)
rename_pdfs_to_standard(invoice_folder)

# Load processed files list (one filename per line); start empty when the
# tracking file does not exist yet.
processed_files = set()
if os.path.exists(processed_file_txt):
    with open(processed_file_txt, "r") as f:
        processed_files = {entry.strip() for entry in f}

# ------------------ Detect New PDFs ------------------

all_pdfs = [
    name for name in os.listdir(invoice_folder)
    if name.lower().endswith(".pdf")
]
new_pdfs = [name for name in all_pdfs if name not in processed_files]

if new_pdfs:
    print("New PDFs to process:", new_pdfs)
else:
    print("No new statements found. Nothing to update.")

Renamed '000HF_W27.pdf' → 'HF_W27.pdf'
No new statements found. Nothing to update.


## Process new PDFs

In [5]:
# Column order matches the records built by parse_statement_long. Passing it
# explicitly keeps the schema when no PDF was processed; otherwise the frame
# would have no columns and later column lookups would raise KeyError.
detail_columns = ["Week", "Year", "Period Start", "Period End", "Category", "Amount"]

all_transactions_new = []
statement_summary_new = []

for pdf_file in new_pdfs:
    week = get_week_from_filename(pdf_file)
    if week is None:
        # Rows without a week are dropped by the per-week groupby used for
        # the calculated summary, so make that visible.
        print(f"Warning: no week number found in '{pdf_file}'; "
              "its rows will be missing from the weekly summary.")
    pdf_path = os.path.join(invoice_folder, pdf_file)

    # extract_text() may return None for a page, hence the `or ""`.
    with pdfplumber.open(pdf_path) as pdf:
        full_text = "\n".join(page.extract_text() or "" for page in pdf.pages)

    all_transactions_new.extend(
        parse_statement_long(full_text, week)
    )

    statement_summary_new.append(
        parse_statement_summary(full_text, week)
    )

df_details_new = pd.DataFrame(all_transactions_new, columns=detail_columns)
df_details_new


In [None]:
# Column order matches the dicts returned by parse_statement_summary. Passing
# it explicitly keeps the schema (including the "Week" merge key) when no
# statement was parsed.
df_statement_summary_new = pd.DataFrame(
    statement_summary_new,
    columns=[
        "Week",
        "Gross Sales (Statement)",
        "Net Sales (Statement)",
        "Credits (Statement)",
        "Deductions (Statement)",
        "Net Payment Total (Statement)",
    ],
)

## Compare Calculated vs Statement

In [None]:
# Schemas used to build well-formed (possibly empty) frames.
calc_columns = [
    "Week",
    "Year",
    "Period Start",
    "Period End",
    "Net Sales (Calculated)",
    "Total Expenses (Calculated)",
    "Net Profit (Calculated)",
]
statement_value_columns = [
    "Gross Sales (Statement)",
    "Net Sales (Statement)",
    "Credits (Statement)",
    "Deductions (Statement)",
    "Net Payment Total (Statement)",
]

summary_rows_new = []

# When nothing was parsed the details frame may have no "Week" column at all,
# and groupby("Week") would raise KeyError, so skip the loop in that case.
if not df_details_new.empty:
    for week, group in df_details_new.groupby("Week"):
        # Header fields are identical for every row of one statement.
        year = group["Year"].iloc[0]
        period_start = group["Period Start"].iloc[0]
        period_end = group["Period End"].iloc[0]

        net_sales = group.loc[group["Category"].str.contains("Net Sales"), "Amount"].sum()
        # Expenses are the negative amounts, so adding them reduces net sales.
        total_expenses = group.loc[group["Amount"] < 0, "Amount"].sum()
        net_profit = net_sales + total_expenses

        summary_rows_new.append({
            "Week": week,
            "Year": year,
            "Period Start": period_start,
            "Period End": period_end,
            "Net Sales (Calculated)": net_sales,
            "Total Expenses (Calculated)": total_expenses,
            "Net Profit (Calculated)": net_profit
        })

df_summary_calc_new = pd.DataFrame(summary_rows_new, columns=calc_columns)


# ---------------- Compare Calculated vs Statement ----------------
if df_summary_calc_new.empty:
    # Nothing to compare: build an empty frame with the same columns the merge
    # below would produce instead of merging frames that may lack "Week".
    df_compare_new = pd.DataFrame(
        columns=calc_columns + statement_value_columns + ["Net Profit Difference"]
    )
    print("No new weekly summaries to compare.")
else:
    df_compare_new = df_summary_calc_new.merge(
        df_statement_summary_new,
        on="Week",
        how="left"
    )

    df_compare_new["Net Profit Difference"] = (
        df_compare_new["Net Profit (Calculated)"] - df_compare_new["Net Payment Total (Statement)"]
    )

    print("Summary comparison for new PDFs:")

df_compare_new

## Append to Excel (prevent duplicates)

In [None]:
# Columns that together identify one statement.
statement_key = ["Week", "Period Start", "Period End"]

# NOTE(review): assumes statement dates are month-first (MM/DD/YYYY); change
# this if the statements are day-first. Values that do not parse sort last.
period_date_format = "%m/%d/%Y"


def _statement_ids(frame):
    """Return one string id per row built from the statement key columns."""
    parts = frame[statement_key].astype(object)
    # Normalise missing values (None in new rows, NaN after an Excel
    # round-trip) so they compare equal.
    parts = parts.where(parts.notna(), "").astype(str)
    return parts["Week"] + "|" + parts["Period Start"] + "|" + parts["Period End"]


def _merge_statements(existing, new):
    """Append ``new`` to ``existing``, replacing statements present in both.

    De-duplication is per statement, not per row: every row of a statement
    shares the same key, so a row-level drop_duplicates on the key would keep
    only one transaction per statement.
    """
    if new.empty:
        return existing
    superseded = _statement_ids(existing).isin(set(_statement_ids(new)))
    return pd.concat([existing[~superseded], new], ignore_index=True)


def _sort_by_period_start(frame):
    """Sort chronologically by "Period Start", keeping row order within ties."""
    return frame.sort_values(
        by="Period Start",
        # The column holds date *strings*; sorting them as text is not
        # chronological (e.g. "01/06/2025" would sort before "12/30/2024").
        key=lambda col: pd.to_datetime(col, format=period_date_format, errors="coerce"),
        kind="stable",
    )


if df_details_new.empty and df_compare_new.empty:
    print(f"No new rows to write; {output_file} left unchanged.")
else:
    if os.path.exists(output_file):
        existing_details = pd.read_excel(output_file, sheet_name="Transaction Details")
        existing_summary = pd.read_excel(output_file, sheet_name="Summary Comparison")

        df_details_combined = _merge_statements(existing_details, df_details_new)
        df_compare_combined = _merge_statements(existing_summary, df_compare_new)
    else:
        df_details_combined = df_details_new
        df_compare_combined = df_compare_new

    # Sort by date (Earliest -> latest)
    df_details_combined = _sort_by_period_start(df_details_combined)
    df_compare_combined = _sort_by_period_start(df_compare_combined)

    # Write back
    with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
        df_details_combined.to_excel(writer, sheet_name="Transaction Details", index=False)
        df_compare_combined.to_excel(writer, sheet_name="Summary Comparison", index=False)

    print(f"Excel updated: {output_file}")

In [None]:
# ---------------- Update processed_files.txt ----------------
# Append (never rewrite) so names recorded by earlier runs are kept.
with open(processed_file_txt, "a") as f:
    f.writelines(f"{pdf_name}\n" for pdf_name in new_pdfs)

print("Processed files list updated.")