# In [3]:
import pdfplumber
import json
import re
from collections import defaultdict
import numpy as np



def extract_words(pdf_path):
    """Collect the words of every page of a PDF into one flat list.

    Args:
        pdf_path: Path of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        A list holding the results of ``page.extract_words()`` for each
        page, concatenated in page order.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Flatten page by page so the document's page order is preserved.
        return [
            word
            for page in pdf.pages
            for word in page.extract_words()
        ]


def group_rows(words, tolerance=5):
    """Bucket words into rows by their vertical position.

    Each word's vertical coordinate is snapped to the nearest multiple of
    ``tolerance``; words sharing a snapped value form one row.

    The document-relative ``'doctop'`` coordinate is used when a word has
    one, so that words at the same height on *different pages* are not
    merged into a single row. Words without ``'doctop'`` fall back to the
    page-relative ``'top'``.

    Args:
        words: Iterable of word dicts with a ``'top'`` key and optionally a
            ``'doctop'`` key.
        tolerance: Bucket size, in the same units as the coordinates. Must
            be non-zero (it is used as a divisor).

    Returns:
        A ``defaultdict(list)`` mapping each snapped vertical coordinate to
        the words in that row, in input order.
    """
    rows = defaultdict(list)

    for word in words:
        # Checked with ``in`` rather than ``.get(..., word['top'])`` so that
        # a word carrying only 'doctop' does not raise KeyError on 'top'.
        vertical = word['doctop'] if 'doctop' in word else word['top']
        y_key = round(vertical / tolerance) * tolerance
        rows[y_key].append(word)

    return rows

def sort_rows(rows):
    """Order rows top-to-bottom and the words in each row left-to-right.

    Args:
        rows: Mapping from a row's vertical key to its list of word dicts;
            each word dict must have an ``'x0'`` key.

    Returns:
        A list of rows in ascending key order, each row being a new list of
        its words sorted by ``'x0'``.
    """
    return [
        sorted(rows[y_key], key=lambda word: word['x0'])
        for y_key in sorted(rows)
    ]


def detect_columns(rows, threshold=40):
    """Estimate column positions from the left edges of all words.

    All ``'x0'`` values are sorted and split into clusters: a value joins
    the current cluster when it lies strictly less than ``threshold`` from
    the previous value, otherwise it starts a new cluster.

    Args:
        rows: Iterable of rows, each an iterable of word dicts with ``'x0'``.
        threshold: Gap between consecutive sorted ``x0`` values at or above
            which a new column starts.

    Returns:
        A list with the mean ``x0`` of each cluster, in ascending order.
        Empty when there are no words.
    """
    ordered_xs = sorted(word['x0'] for row in rows for word in row)

    clusters = []
    for x in ordered_xs:
        # Compare against the most recently added value, not the cluster mean.
        if clusters and abs(x - clusters[-1][-1]) < threshold:
            clusters[-1].append(x)
        else:
            clusters.append([x])

    return [sum(cluster) / len(cluster) for cluster in clusters]



def build_table(rows, column_centers):
    """Assign each word to its nearest column and assemble text rows.

    Args:
        rows: Iterable of rows, each an iterable of word dicts with ``'x0'``
            and ``'text'`` keys.
        column_centers: Sequence of x positions, one per column.

    Returns:
        A list of rows, each a list of ``len(column_centers)`` strings. The
        words of a cell are joined by single spaces in row order and the
        result is stripped. Rows whose cells are all empty are omitted.
    """
    n_cols = len(column_centers)
    table = []

    for row in rows:
        buckets = [[] for _ in range(n_cols)]

        for word in row:
            distances = [abs(word['x0'] - center) for center in column_centers]
            # min() over indices returns the first of equally-near columns.
            nearest = min(range(n_cols), key=distances.__getitem__)
            buckets[nearest].append(word['text'])

        cells = [" ".join(parts).strip() for parts in buckets]

        if any(cells):
            table.append(cells)

    return table


def extract_invoice_fields(full_text):
    """Pull the invoice number, date and total amount out of free text.

    All matching is case-insensitive and the first match in the text wins.

    Args:
        full_text: The document text as a single string. ``None`` or an
            empty string yields a result with every field ``None``.

    Returns:
        A dict with keys:
            ``"invoice_number"``: the token following an ``Invoice`` label
                (``str``), or ``None``.
            ``"date"``: the raw date string following ``Date`` (``str``),
                or ``None``.
            ``"total_amount"``: the number following ``Total`` /
                ``Total Amount`` as a ``float`` with thousands commas
                removed, or ``None``.
    """
    if not full_text:
        return {"invoice_number": None, "date": None, "total_amount": None}

    flags = re.IGNORECASE

    # Prefer an explicitly labelled number ("Invoice Number: X",
    # "Invoice No. X"). "Number" is listed before "No" so the alternation
    # cannot stop after "No" and capture the tail "mber:" as the value.
    # Fall back to the bare "Invoice X" form only when no label is present.
    invoice_number = (
        re.search(r'Invoice\s*(?:Number|No\b\.?)\s*[:\-]?\s*(\S+)',
                  full_text, flags)
        or re.search(r'Invoice\s*[:\-]?\s*(\S+)', full_text, flags)
    )

    # \b keeps "Update" from matching; the value must start and end with a
    # digit so a lone "-" or "." (or trailing punctuation) is not captured.
    date = re.search(
        r'\bDate\s*[:\-]?\s*(\d(?:[\d\-\/\.]*\d)?)', full_text, flags
    )

    # "Amount" is optional as a whole word, \b keeps "Subtotal" from
    # matching, and an optional currency symbol may precede the number.
    # The captured group is always digits/commas with an optional decimal
    # part, so float() below cannot raise.
    total = re.search(
        r'\bTotal\s*(?:Amount)?\s*[:\-]?\s*(?:[$€£₹]\s*)?'
        r'(\d[\d,]*(?:\.\d+)?)',
        full_text,
        flags,
    )

    return {
        "invoice_number": invoice_number.group(1) if invoice_number else None,
        "date": date.group(1) if date else None,
        "total_amount": float(total.group(1).replace(",", "")) if total else None
    }


def filter_table_rows(table):
    """Keep only the table rows that contain at least one digit.

    Args:
        table: Iterable of rows, each an iterable of cell strings.

    Returns:
        A list of the rows, in original order, in which some cell contains
        a digit character.
    """
    def has_digit(row):
        return any(re.search(r'\d', cell) for cell in row)

    return [row for row in table if has_digit(row)]



def run_pipeline(pdf_path, output_json="output.json"):
    """Run the full extraction on one PDF and write the result as JSON.

    Steps: extract words, group them into rows, sort the rows, detect
    columns, build the table, drop rows without digits, extract the header
    fields from the joined word text, then dump everything to
    ``output_json``. A progress message is printed before each step.

    Args:
        pdf_path: Path of the PDF to process.
        output_json: Path of the JSON file to write.

    Returns:
        The dict that was written: the fields returned by
        ``extract_invoice_fields`` plus a ``"line_items"`` key holding the
        filtered table.
    """
    print("Extracting words...")
    words = extract_words(pdf_path)

    print("Grouping rows...")
    rows = group_rows(words)

    print("Sorting rows...")
    sorted_rows = sort_rows(rows)

    print("Detecting columns...")
    column_centers = detect_columns(sorted_rows)

    print("Building table...")
    table = build_table(sorted_rows, column_centers)

    print("Filtering table rows...")
    table = filter_table_rows(table)

    print("Extracting header fields...")
    full_text = " ".join(w['text'] for w in words)
    fields = extract_invoice_fields(full_text)

    result = {
        **fields,
        "line_items": table
    }

    print("Saving JSON output...")
    # Explicit encoding so the output does not depend on the platform's
    # default locale encoding.
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=4)

    print("Done! Output saved to", output_json)

    # Returned so callers can use the result without re-reading the file.
    return result



# Script entry point: run the pipeline on a sample invoice located in the
# current working directory, writing to run_pipeline's default output path.
if __name__ == "__main__":
    pdf_path = "sample_invoice.pdf"
    run_pipeline(pdf_path)


# Output:
# Extracting words...
# Grouping rows...
# Sorting rows...
# Detecting columns...
# Building table...
# Filtering table rows...
# Extracting header fields...
# Saving JSON output...
# Done! Output saved to output.json
