In [46]:
pip install camelot-py[cv]





[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
import os
import re
import json
import zipfile
import camelot
from PyPDF2 import PdfReader
from io import BytesIO
import pandas as pd

# Path to the ZIP file
zip_file_path = r"Invoice_dec.zip"

# Initialize a dictionary to hold extracted invoice data
invoice_data = {"invoices": []}

# Step 1: Open the ZIP file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    # Get a list of PDF files in the ZIP archive
    pdf_files = [f for f in zip_ref.namelist() if f.endswith('.pdf')]

    if not pdf_files:
        raise FileNotFoundError("No PDF files found in the ZIP file.")

    # Function to process a single PDF file
    def process_pdf(pdf_file):
        try:
            with zip_ref.open(pdf_file) as file:
                pdf_data = file.read()
                pdf_bytes = BytesIO(pdf_data)

                # Extract tables using Camelot with lattice flavor for better table detection
                tables = camelot.read_pdf(pdf_bytes, pages='all', flavor='lattice')
                purchase_data = []

                if len(tables) > 0 and tables[0].df.shape[0] > 1:
                    table = tables[0].df
                    table = pd.DataFrame(table)

                    # Clean column headers and set them
                    table.columns = [str(col).strip() for col in table.iloc[0]]
                    table = table[1:].reset_index(drop=True)

                    # Remove empty rows
                    table = table[table[table.columns[0]].astype(str).str.strip().astype(bool)]
                    table = table.reset_index(drop=True)

                    # Dynamically map columns
                    column_mapping = {}
                    for col in table.columns:
                        col_lower = str(col).lower()
                        if "description" in col_lower:
                            column_mapping[col] = "DESCRIPTION"
                        elif "qty" in col_lower or "quantity" in col_lower:
                            column_mapping[col] = "QTY"
                        elif "unit price" in col_lower or "price" in col_lower:
                            column_mapping[col] = "UNIT PRICE"
                        elif "total" in col_lower:
                            column_mapping[col] = "TOTAL"
                        elif re.match(r"^\d+$", str(col)):
                            column_mapping[col] = "No"

                    table.rename(columns=column_mapping, inplace=True)
                    print(f"Processing {pdf_file} - Table Data:\n{table}")  # Debug output

                    # Helper functions
                    def clean_text(value):
                        return str(value).strip()

                    def clean_numeric(value):
                        cleaned = str(value).replace(",", "").replace(" ", "").strip()
                        return float(cleaned) if cleaned.replace(".", "", 1).isdigit() else 0.0

                    # Process rows
                    for _, row in table.iterrows():
                        try:
                            if not str(row.get("No", "")).strip().isdigit():
                                continue

                            # Extract description and category
                            description = clean_text(row.get("DESCRIPTION", ""))
                            category = "Unknown"
                            if "-" in description:
                                parts = description.split("-", 1)  # Split only once
                                description = parts[0].strip()
                                category = parts[1].strip() if len(parts) > 1 else "Unknown"

                            # Extract numeric values
                            qty = clean_numeric(row.get("QTY", 0))
                            unit_price = clean_numeric(row.get("UNIT PRICE", 0))
                            total = qty * unit_price

                            purchase_data.append({
                                "description": description,
                                "quantity": qty,
                                "unit_price": unit_price,
                                "total": total,
                                "category": category
                            })
                        except Exception as e:
                            print(f"Row error: {e}")

                # Extract text for invoice metadata
                reader = PdfReader(pdf_bytes)
                pdf_text = " ".join(page.extract_text() or "" for page in reader.pages)

                # Regex patterns for invoice details
                patterns = {
                    "invoice_number": r"INVOICE\s*NO\.?\s*(\d+)",
                    "sub_total": r"SUBTOTAL\s+([\d,\. ]+)",
                    "discount": r"DISCOUNT\s+([\d,\. ]+)",
                    "tax_rate": r"TAX RATE\s+([\d\.]+)%",
                    "total_tax": r"TOTAL TAX\s+([\d,\. ]+)",
                    "total_amount": r"Total\s*â‚¹\s*([\d,\. ]+)"
                }

                extracted = {}
                for key, pattern in patterns.items():
                    match = re.search(pattern, pdf_text.replace(",", "").replace(" ", ""))
                    if match:
                        extracted[key] = float(match.group(1).replace(",", "").strip()) if match else 0.0
                    else:
                        extracted[key] = 0.0 if "amount" in key or "total" in key else "Unknown"

                # Build invoice details
                invoice_details = {
                    "sub_total": extracted["sub_total"],
                    "discount": extracted["discount"],
                    "subtotal_less_discount": float(extracted["sub_total"]) - float(extracted["discount"]),
                    "tax_rate": f"{extracted['tax_rate']}%",
                    "total_tax": extracted["total_tax"],
                    "total_amount": extracted["total_amount"]
                }

                return {
                    "invoice_number": extracted.get("invoice_number", "Unknown"),
                    "purchase_details": purchase_data,
                    "invoice_details": invoice_details,
                    "customer": {"name": "Unknown"}
                }

        except Exception as e:
            print(f"Failed to process {pdf_file}: {e}")
            return None

    # Process all PDFs
    for pdf_file in pdf_files:
        invoice = process_pdf(pdf_file)
        if invoice:
            invoice_data["invoices"].append(invoice)

# Save to JSON
output_json_file = "output.json"
with open(output_json_file, 'w') as f:
    json.dump(invoice_data, f, indent=4)

print(f"Data saved to {output_json_file}")

Processing INV_2.pdf - Table Data:
  DESCRIPTION                                                                 \
0  1\n2\n3\n4  Levis - Tshirt\nOtto White -Shirt\nCargo - tra...  2\n2\n1\n1   

                                                                              
0  1150.00\n750.00\n600.00\n1150.00  0.00\n2300.00\n1500.00\n600.00\n1150.00  
Failed to process INV_2.pdf: could not convert string to float: 'Unknown'
Processing INV_3.pdf - Table Data:
  DESCRIPTION                                                              \
0     1\n2\n3  Levis - Jeans\nBlack T-Shirt\nAddidas Yellow S...  2\n2\n1   

                                                               
0  1200.00\n1500.00\n3000.00  0.00\n2400.00\n3000.00\n3000.00  
Failed to process INV_3.pdf: could not convert string to float: 'Unknown'
Processing INV_4.pdf - Table Data:
     DESCRIPTION                                                     \
0  1\n2\n3\n4\n5  Levis - Jeans\nBlack T-Shirt\nAddidas Yellow S...   

  