In [4]:
# notebook to test out the basic functionlaites needed for the solution

In [40]:
# !pip install pdfplumber

In [34]:
import pytesseract
import pdfplumber
from pdf2image import convert_from_path
import warnings
import os
import re
warnings.filterwarnings('ignore')

In [35]:
# the user uploads the image or the pdf and we convert it and get data out of it
file_path = 'ABC Company - Invoice INV0001.pdf' # this is a sample invoice we shall be using to extract data
def read_invoice(invoice):
    """
    This method shall only accept PDF or Images of the invoice and we shall get the data out of it.
    """
    file_extension = os.path.splitext(file_path)[1].lower()
    
    if file_extension == '.pdf':
        # reading data from the pdf, handling pdf data
        text = ""
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                text += page.extract_text() + "\n"
    else:
        image = Image.open(file_path)
        text = pytesseract.image_to_string(image)
    
    return text

In [36]:
txt = read_invoice(file_path)

In [41]:
def parse_invoice_metadata(text):
    """
    Parse through the data gathered by read_invoice()
    """
    lines = text.split('\n')

    # Initialize a dictionary to store metadata
    metadata = {
        "invoice_number": None,
        "date": None,
        "due_date": None,
        "total_amount": None,
        "customer_name": None,
        "customer_email": None,
        "business_name": None,
        "business_email": None
    }

    # Regex patterns for different fields
    invoice_pattern = re.compile(r"INV\d+")           
    date_pattern = re.compile(r"\b\w{3,9} \d{1,2}, \d{4}\b")  
    due_date_pattern = re.compile(r"due\b.*(?:\bon\b|\breceipt\b)", re.IGNORECASE)  
    amount_pattern = re.compile(r"CAD \$\d+(\.\d{2})?") 
    email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}") 

    # Extract values based on patterns
    for i, line in enumerate(lines):
        if not metadata["invoice_number"] and invoice_pattern.search(line):
            metadata["invoice_number"] = invoice_pattern.search(line).group()
        
        if not metadata["date"] and date_pattern.search(line):
            metadata["date"] = date_pattern.search(line).group()

        if not metadata["due_date"] and due_date_pattern.search(line):
            # Capture text around the due date (e.g., "On Receipt" or a specific date)
            metadata["due_date"] = line.strip()
        
        if not metadata["total_amount"] and amount_pattern.search(line):
            metadata["total_amount"] = amount_pattern.search(line).group()
        
        if not metadata["customer_email"] and email_pattern.search(line):
            metadata["customer_email"] = email_pattern.search(line).group()
        
        # Match customer name based on 'BILL TO' line, then store next line as name
        if "BILL TO" in line:
            metadata["customer_name"] = lines[i + 1].strip() if i + 1 < len(lines) else None
        
        # Capture business name by looking for lines before "INVOICE"
        if "INVOICE" in line and i > 0:
            metadata["business_name"] = lines[i - 1].strip()

        # Capture business email by looking for an email not already marked as customer email
        if email_pattern.search(line) and not metadata["business_email"]:
            email_match = email_pattern.search(line).group()
            # If it's not the customer email, assign it to business email
            if email_match != metadata["customer_email"]:
                metadata["business_email"] = email_match

    return metadata


In [43]:
parse_invoice_metadata(txt)

{'invoice_number': 'INV0001',
 'date': 'Nov 9, 2024',
 'due_date': None,
 'total_amount': 'CAD $105.00',
 'customer_name': 'John Doe',
 'customer_email': 'abc.inv@business.co',
 'business_name': '11/9/24, 12:53 PM ABC Company - Invoice INV0001',
 'business_email': 'John.doe@email.com'}