# <center> Benchmarking Invoice Classification from an Email Inbox </center>
***

In [1]:
import easyocr
import email
import os
from openai import AzureOpenAI
from dotenv import load_dotenv
import PyPDF2
import pdfplumber
import pymupdf
import base64
from mimetypes import guess_type
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from msrest.authentication import CognitiveServicesCredentials


### Initializing OpenAI client
***

In [2]:
# Load environment variables via python-dotenv before reading the settings.
load_dotenv()

# Azure OpenAI connection settings; os.getenv yields None for unset variables.
GPT_DEPLOYMENT_NAME = os.getenv('GPT_DEPLOYMENT_NAME')
GPT_VERSION = os.getenv('GPT_VERSION')
GPT_ENDPOINT = os.getenv('GPT_ENDPOINT')
GPT_KEY = os.getenv('GPT_KEY')

# Shared client used by every chat-completion call in this notebook.
client = AzureOpenAI(azure_endpoint=GPT_ENDPOINT, api_version=GPT_VERSION, api_key=GPT_KEY)

In [3]:
def invoice_classifier(content):
    """Ask the Azure OpenAI deployment whether ``content`` looks like an invoice.

    Args:
        content: Text extracted from a document (PDF, image, attachment, ...).

    Returns:
        The model's reply as a string; the prompts ask for a Yes/No verdict.
        An empty string is returned when the reply carries no text, so that
        callers can safely run substring checks such as ``"Yes" in result``.
    """
    query = f'''Does the below content indicate an Invoice statement ?
            {content}
            // Yes or No.'''

    system_prompt = ''' You are an helpful AI assistant. You help users to decide whether the content provided by them represents an invoice or not. You can judge the content based on certain invoice-related keywords. A few of them include:
                                    - Invoice Number
                                    - Invoice Date
                                    - Statement
                                    - Total Due
                                    - Due Date
                                    - Bill To
                                    - Sold to
                                    - Ship To
                                    - Customer
                                    - Order Number
                                    - Purchase Order (PO) Number
                                    - Terms
                                    - Line Item
                                    - Quantity
                                    - Unit Price
                                    - Total
                                    - Subtotal
                                    - Tax
                                    - Discount
                                    - Shipping Cost
                                    - Balance Due
                                    - Amount Paid
                                    - Remit To
                                    - Description
                                    - Net Price
                                    - Contact Information
                                    If you find a **good amount of keywords in the given content**, you can **return your verdict as Yes, else No**. Give your verdict in a simple Yes/No format.
                                    '''

    response = client.chat.completions.create(
        model=GPT_DEPLOYMENT_NAME,
        messages=[
            {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
            {"role": "user", "content": [{"type": "text", "text": query}]},
        ],
        max_tokens=100,
        temperature=0,
        top_p=0.95,
        seed=101,
    )

    # The SDK types message.content as optional; normalise a missing reply to
    # "" so that `"Yes" in isInvoice` in the callers cannot raise TypeError.
    return response.choices[0].message.content or ""

### Classify the document type and select the corresponding extraction method
***

In [None]:
def read_document(file_path, flag = 0):
    """Extract the content of a file, choosing the reader by file extension.

    Args:
        file_path: Path of the document to read.
        flag: Picks the extractor where two are available. ``0`` (default)
            uses PyPDF2 for PDFs and Azure OCR for images; any other value
            uses pdfplumber for PDFs and EasyOCR for images.

    Returns:
        Whatever the selected reader returns.

    Raises:
        ValueError: If the file extension is not one of the supported types.
    """
    _, file_extension = os.path.splitext(file_path)
    extension = file_extension.lower()

    # NOTE(review): read_txt, read_docx and read_doc are not defined in the
    # visible part of this notebook -- confirm they exist before relying on
    # those branches.
    if extension == '.txt':
        return read_txt(file_path)
    if extension == '.pdf':
        return read_pdf_pypdf2(file_path) if flag == 0 else read_pdf_pdfPlumber(file_path)
    if extension == '.docx':
        return read_docx(file_path)
    if extension == '.doc':
        return read_doc(file_path)
    if extension in ('.jpg', '.jpeg', '.png'):
        # The EasyOCR reader in this notebook is read_img_easyocr; the earlier
        # call to `read_img` referenced a name that is never defined.
        return read_img_azure_ocr(file_path) if flag == 0 else read_img_easyocr(file_path)
    if extension == '.eml':
        return read_eml(file_path)

    raise ValueError(f"Unsupported file type: {file_extension}")

### Create a dictionary of files for monitoring classification consistency

In [None]:
def get_files_dict(folder_path):
    """Return a ``{file_name: 0}`` mapping for the regular files in a folder.

    Entries of ``folder_path`` that are not regular files (for example
    sub-directories) are left out. The zero values serve as per-file
    counters for the consistency checks.
    """
    counters = {}
    for entry in os.listdir(folder_path):
        if os.path.isfile(os.path.join(folder_path, entry)):
            counters[entry] = 0
    return counters

### PDF information extraction using PyPDF2
***

In [None]:
def read_pdf_pypdf2(file_path):
    """Extract the text of every page of a PDF with PyPDF2.

    Args:
        file_path: Path of the PDF file.

    Returns:
        The page texts concatenated in page order, or ``""`` if the file
        cannot be read (the error is printed, not raised).
    """
    try:
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Guard against a page yielding no text so that one such page
            # does not discard the text of the whole document.
            return "".join(page.extract_text() or "" for page in reader.pages)

    except Exception as e:
        print(f"Error reading PDF: {e}")
        return ""

Positive Examples
***

In [None]:
# Classify every known-invoice PDF using text extracted with PyPDF2 and
# count how many the model labels "Yes".
tpr = 0
for pdf in os.listdir('attachments/Positive Examples/pdfs/'):
    file_path = 'attachments/Positive Examples/pdfs/' + pdf
    content = read_pdf_pypdf2(file_path)
    isInvoice = invoice_classifier(content)
    if "Yes" in isInvoice:
        tpr += 1
    print(f"{pdf} is Invoice: {isInvoice}")

print("-------------------------------------------------------------------------------------------------")
total = len(os.listdir('attachments/Positive Examples/pdfs/'))
print(f"\n\nTrue Positive Rate: {tpr}/{total}")

Negative Examples
***

In [None]:
# Classify every non-invoice PDF using text extracted with PyPDF2.
# A "Yes" on this set is a false positive, so it is reported as such
# (the summary line previously labelled this count "True Positive Rate").
negative_pdfs = os.listdir('attachments/Negative Examples/pdfs/')
fpr = 0
for pdf in negative_pdfs:
    file_path = f'attachments/Negative Examples/pdfs/{pdf}'
    content = read_pdf_pypdf2(file_path)
    isInvoice = invoice_classifier(content)
    fpr += 1 if "Yes" in isInvoice else 0
    print(f"{pdf} is Invoice: {isInvoice}")

print("-------------------------------------------------------------------------------------------------")
print(f"\n\nFalse Positive Rate: {fpr}/{len(negative_pdfs)}")

Iterative testing - Consistency check
***


In [None]:
# Repeat the PyPDF2-based classification several times and record, per file,
# how many runs produced a "Yes".
test_dict = get_files_dict('attachments/Positive Examples/pdfs/')
iterations = 10

for i in range(iterations):
    tpr = 0
    # Iterate the tracked names rather than a fresh os.listdir():
    # get_files_dict registers regular files only, so a sub-directory in the
    # folder would otherwise raise KeyError on test_dict[pdf].
    for pdf in test_dict:
        file_path = f'attachments/Positive Examples/pdfs/{pdf}'
        content = read_pdf_pypdf2(file_path)
        isInvoice = invoice_classifier(content)
        if "Yes" in isInvoice:
            tpr += 1
            test_dict[pdf] += 1

    print(f"Iteration-{i+1} -> True Positive Rate: {tpr}/{len(test_dict)}")
    print("-------------------------------------------------------------------------------------------------\n")

for key, value in test_dict.items():
    print(f"{key} -> {value}/{iterations}")

### PDF information extraction using PDF Plumber
***


In [None]:
def read_pdf_pdfPlumber(file_path):
    """Extract the text of every page of a PDF with pdfplumber.

    Args:
        file_path: Path of the PDF file.

    Returns:
        The page texts in page order, each followed by a newline, or ``""``
        if extraction fails (the error is printed, not raised).
    """
    try:
        with pdfplumber.open(file_path) as pdf:
            # A page without extractable text may yield None; treat it as
            # empty instead of letting `None + "\n"` abort the whole file.
            return "".join((page.extract_text() or "") + "\n" for page in pdf.pages)
    except Exception as e:
        print(f"Failed to extract text from PDF: {e}")
        return ""


Positive Examples
***

In [None]:
# Classify every known-invoice PDF using text extracted with pdfplumber and
# count how many the model labels "Yes".
tpr = 0
for pdf in os.listdir('attachments/Positive Examples/pdfs/'):
    file_path = 'attachments/Positive Examples/pdfs/' + pdf
    content = read_pdf_pdfPlumber(file_path)
    isInvoice = invoice_classifier(content)
    if "Yes" in isInvoice:
        tpr += 1
    print(f"{pdf} is Invoice: {isInvoice}")

print("-------------------------------------------------------------------------------------------------")
total = len(os.listdir('attachments/Positive Examples/pdfs/'))
print(f"\n\nTrue Positive Rate: {tpr}/{total}")

Negative Examples
***


In [None]:
# Classify every non-invoice PDF using text extracted with pdfplumber.
# A "Yes" on this set is a false positive, so it is reported as such
# (the summary line previously labelled this count "True Positive Rate").
negative_pdfs = os.listdir('attachments/Negative Examples/pdfs/')
fpr = 0
for pdf in negative_pdfs:
    file_path = f'attachments/Negative Examples/pdfs/{pdf}'
    content = read_pdf_pdfPlumber(file_path)
    isInvoice = invoice_classifier(content)
    fpr += 1 if "Yes" in isInvoice else 0
    print(f"{pdf} is Invoice: {isInvoice}")

print("-------------------------------------------------------------------------------------------------")
print(f"\n\nFalse Positive Rate: {fpr}/{len(negative_pdfs)}")

Iterative testing - Consistency check
***

In [None]:
# Repeat the pdfplumber-based classification several times and record, per
# file, how many runs produced a "Yes".
test_dict = get_files_dict('attachments/Positive Examples/pdfs/')
iterations = 10

for i in range(iterations):
    tpr = 0
    # Iterate the tracked names rather than a fresh os.listdir():
    # get_files_dict registers regular files only, so a sub-directory in the
    # folder would otherwise raise KeyError on test_dict[pdf].
    for pdf in test_dict:
        file_path = f'attachments/Positive Examples/pdfs/{pdf}'
        content = read_pdf_pdfPlumber(file_path)
        isInvoice = invoice_classifier(content)
        if "Yes" in isInvoice:
            tpr += 1
            test_dict[pdf] += 1

    print(f"Iteration-{i+1} -> True Positive Rate: {tpr}/{len(test_dict)}")
    print("-------------------------------------------------------------------------------------------------\n")

for key, value in test_dict.items():
    print(f"{key} -> {value}/{iterations}")

### PDF information extraction using PDF Plumber + EasyOCR
***

In [None]:
def read_pdf_as_image_easyocr(file_path, image_path='attachments/Positive Examples/images/sample.png'):
    """Rasterise each page of a PDF and read its text with EasyOCR.

    Args:
        file_path: Path of the PDF file.
        image_path: Where each rendered page is written before OCR; the file
            is overwritten for every page.

    Returns:
        The recognised text of all pages, space-separated, or ``""`` if
        anything fails (the error is printed, not raised).
    """
    try:
        # print("Reading PDF as image using Easy OCR...")
        page_texts = []
        # The context manager closes the document once all pages are read.
        with pymupdf.open(file_path) as doc:
            # Build the OCR reader once per document rather than once per page.
            reader = easyocr.Reader(['en'], verbose=False)
            for page in doc:
                pix = page.get_pixmap()
                pix.save(image_path)
                result = reader.readtext(image_path)
                page_texts.append(' '.join(text[1] for text in result))

        # Previously the function returned inside the loop, so only the first
        # page was ever read and a zero-page document returned None.
        return ' '.join(page_texts)

    except Exception as e:
        print(f"Failed to extract text from PDF: {e}")
        return ""

In [None]:
# Classify known-invoice PDFs from pdfplumber text; when the verdict
# contains "No", retry with text obtained by OCR-ing the rendered PDF.
tpr = 0
for pdf in os.listdir('attachments/Positive Examples/pdfs/'):
    file_path = 'attachments/Positive Examples/pdfs/' + pdf
    isInvoice = invoice_classifier(read_pdf_pdfPlumber(file_path))

    if "No" in isInvoice:
        content = read_pdf_as_image_easyocr(file_path)
        isInvoice = invoice_classifier(content)

    if "Yes" in isInvoice:
        tpr += 1
    print(f"{pdf} is Invoice: {isInvoice}")

print("-------------------------------------------------------------------------------------------------")
total = len(os.listdir('attachments/Positive Examples/pdfs/'))
print(f"\n\nTrue Positive Rate: {tpr}/{total}")

Iterative testing - Consistency check
***

In [None]:
# Repeat the pdfplumber classification (with EasyOCR fallback on a "No"
# verdict) several times and record, per file, how many runs said "Yes".
test_dict = get_files_dict('attachments/Positive Examples/pdfs/')
iterations = 10

for i in range(iterations):
    tpr = 0
    # Iterate the tracked names rather than a fresh os.listdir():
    # get_files_dict registers regular files only, so a sub-directory in the
    # folder would otherwise raise KeyError on test_dict[pdf].
    for pdf in test_dict:
        file_path = f'attachments/Positive Examples/pdfs/{pdf}'
        content = read_pdf_pdfPlumber(file_path)
        isInvoice = invoice_classifier(content)

        if "No" in isInvoice:
            content = read_pdf_as_image_easyocr(file_path)
            isInvoice = invoice_classifier(content)

        if "Yes" in isInvoice:
            tpr += 1
            test_dict[pdf] += 1

    print(f"Iteration-{i+1} -> True Positive Rate: {tpr}/{len(test_dict)}")
    print("-------------------------------------------------------------------------------------------------\n")

for key, value in test_dict.items():
    print(f"{key} -> {value}/{iterations}")

### PDF information extraction from EML using 'email' package
***


In [None]:
def read_eml(file_path, output_dir='attachments/Positive Examples/eml_downloads/'):
    """Save the attachments of an .eml file and extract each one's content.

    Args:
        file_path: Path of the .eml file.
        output_dir: Directory the attachments are written to; it is created
            on demand when the first attachment is saved.

    Returns:
        A dict mapping each attachment's file name to the content returned
        by ``read_document`` for it. An empty dict is returned when there
        are no attachments or when anything fails (the error is printed).
    """
    try:
        attachments_content = dict()
        # Parse from bytes: decoding the raw message with the platform's
        # default text encoding can fail before parsing even starts.
        with open(file_path, 'rb') as file:
            msg = email.message_from_binary_file(file)

        for part in msg.walk():
            if part.get_content_maintype() == 'multipart':
                continue
            if part.get('Content-Disposition') is None:
                continue

            file_name = part.get_filename()
            if not file_name:
                continue

            payload = part.get_payload(decode=True)
            if payload is None:
                continue

            # The attachment name is chosen by the sender; keep only its last
            # path component so it cannot point outside output_dir.
            safe_name = os.path.basename(file_name.replace('\\', '/'))
            if safe_name in ('', '.', '..'):
                continue

            os.makedirs(output_dir, exist_ok=True)
            filepath = os.path.join(output_dir, safe_name)
            with open(filepath, 'wb') as f:
                f.write(payload)
            # print(f"Attachment {file_name} downloaded.")
            # Read back from the joined path that was just written.
            attachments_content[file_name] = read_document(filepath)

        return attachments_content

    except Exception as e:
        print(f"Error reading EML: {e}")
        # Same type as the success path so callers can use dict operations.
        return {}

In [None]:
# Classify every attachment of every known-invoice e-mail.
# The rate is reported per attachment: the numerator counts attachments, so
# the denominator must too (it previously counted .eml files).
tpr = 0
total_attachments = 0
for eml in os.listdir('attachments/Positive Examples/emls/'):
    file_path = f'attachments/Positive Examples/emls/{eml}'
    attachments_content = read_eml(file_path)
    for content in attachments_content:
        isInvoice = invoice_classifier(attachments_content[content])
        total_attachments += 1
        tpr += 1 if "Yes" in isInvoice else 0
        print(f"{content} is Invoice: {isInvoice}\n")

print("-------------------------------------------------------------------------------------------------")
print(f"\n\nTrue Positive Rate: {tpr}/{total_attachments}")

 
Iterative testing - Consistency check
***

In [None]:
# Repeat the e-mail classification several times and record, per .eml file,
# how many runs classified all of its attachments as invoices.
test_dict = get_files_dict('attachments/Positive Examples/emls/')
iterations = 10

for i in range(iterations):
    tpr = 0
    # Iterate the tracked names rather than a fresh os.listdir():
    # get_files_dict registers regular files only, so a sub-directory in the
    # folder would otherwise raise KeyError on test_dict[eml].
    for eml in test_dict:
        file_path = f'attachments/Positive Examples/emls/{eml}'
        attachments_content = read_eml(file_path)
        pr = 0
        for content in attachments_content:
            isInvoice = invoice_classifier(attachments_content[content])
            pr += 1 if "Yes" in isInvoice else 0

        # An e-mail counts as positive when every attachment got a "Yes".
        # len() is used directly so an empty-string result is tolerated too.
        # NOTE(review): an e-mail with no attachments also satisfies this
        # check -- confirm that is intended.
        if len(attachments_content) == pr:
            test_dict[eml] += 1
            tpr += 1

    # Report this iteration's own count; the previous expression printed a
    # truncated running average over all iterations so far.
    print(f"Iteration-{i+1} -> True Positive Rate: {tpr}/{len(test_dict)}")
    print("-------------------------------------------------------------------------------------------------\n")

for key, value in test_dict.items():
    print(f"{key} -> {value}/{iterations}")

### Information extraction from image using 'EasyOCR' package
***

In [None]:
def read_img_easyocr(file_path):
    """Read the text of an image file with EasyOCR (English model).

    Returns the second element of every ``readtext`` result joined with
    single spaces, or ``""`` if anything fails (the error is printed).
    """
    try:
        ocr = easyocr.Reader(['en'], verbose=False)
        fragments = []
        for detection in ocr.readtext(file_path):
            fragments.append(detection[1])
        return ' '.join(fragments)

    except Exception as err:
        print(f"Error reading Image: {err}")
        return ""

Positive Examples
***

In [None]:
# Classify every known-invoice image using text read with EasyOCR and count
# how many the model labels "Yes".
tpr = 0
for img in os.listdir('attachments/Positive Examples/images/'):
    file_path = 'attachments/Positive Examples/images/' + img
    content = read_img_easyocr(file_path)
    isInvoice = invoice_classifier(content)
    if "Yes" in isInvoice:
        tpr += 1
    print(f"{img} is Invoice: {isInvoice}")

print("-------------------------------------------------------------------------------------------------")
total = len(os.listdir('attachments/Positive Examples/images/'))
print(f"\n\nTrue Positive Rate: {tpr}/{total}")

In [None]:
# Classify (at most the first three) non-invoice images using EasyOCR text.
# A "Yes" on this set is a false positive, so it is reported as such, and
# the denominator is the number of images actually evaluated (it previously
# counted the whole directory although only a slice was processed).
negative_images = os.listdir('attachments/Negative Examples/images/')[:3]
fpr = 0
for img in negative_images:
    file_path = f'attachments/Negative Examples/images/{img}'
    content = read_img_easyocr(file_path)
    isInvoice = invoice_classifier(content)
    fpr += 1 if "Yes" in isInvoice else 0
    print(f"{img} is Invoice: {isInvoice}")

print("-------------------------------------------------------------------------------------------------")
print(f"\n\nFalse Positive Rate: {fpr}/{len(negative_images)}")

    
Iterative testing - Consistency check
***

In [None]:
# Repeat the EasyOCR-based image classification several times and record,
# per file, how many runs produced a "Yes".
test_dict = get_files_dict('attachments/Positive Examples/images/')
iterations = 10

for i in range(iterations):
    tpr = 0
    # Iterate the tracked names rather than a fresh os.listdir():
    # get_files_dict registers regular files only, so a sub-directory in the
    # folder would otherwise raise KeyError on test_dict[img].
    for img in test_dict:
        file_path = f'attachments/Positive Examples/images/{img}'
        content = read_img_easyocr(file_path)
        isInvoice = invoice_classifier(content)
        if "Yes" in isInvoice:
            tpr += 1
            test_dict[img] += 1

    print(f"Iteration-{i+1} -> True Positive Rate: {tpr}/{len(test_dict)}")
    print("-------------------------------------------------------------------------------------------------\n")

for key, value in test_dict.items():
    print(f"{key} -> {value}/{iterations}")

### Information extraction from Image using 'Azure OCR' package
***

In [None]:
def read_img_azure_ocr(file_path):
    """Read the text of an image file with the Azure Computer Vision OCR call.

    The key and endpoint are taken from the ``VISION_KEY`` and
    ``VISION_ENDPOINT`` environment variables. Every recognised line is
    emitted as its words joined by spaces, followed by one space.

    Returns:
        The concatenated line texts, or ``""`` if anything fails (the error
        is printed, not raised).
    """
    try:
        load_dotenv()
        subscription_key = os.environ["VISION_KEY"]
        endpoint = os.environ["VISION_ENDPOINT"]
        credentials = CognitiveServicesCredentials(subscription_key)
        vision_client = ComputerVisionClient(endpoint, credentials)

        with open(file_path, "rb") as image_stream:
            ocr_result = vision_client.recognize_printed_text_in_stream(image_stream)

        pieces = []
        for region in ocr_result.regions:
            for line in region.lines:
                pieces.append(" ".join(word.text for word in line.words) + ' ')

        return ''.join(pieces)

    except Exception as err:
        print(f"Error reading Image: {err}")
        return ""

Positive Examples
***

In [None]:
# Classify (at most the first twenty) known-invoice images using Azure OCR
# text. The denominator is the number of images actually evaluated; it
# previously counted the whole directory although only a slice was processed.
positive_images = os.listdir('attachments/Positive Examples/images/')[:20]
tpr = 0
for img in positive_images:
    file_path = f'attachments/Positive Examples/images/{img}'
    content = read_img_azure_ocr(file_path)
    isInvoice = invoice_classifier(content)
    tpr += 1 if "Yes" in isInvoice else 0
    print(f"{img} is Invoice: {isInvoice}")

print("-------------------------------------------------------------------------------------------------")
print(f"\n\nTrue Positive Rate: {tpr}/{len(positive_images)}")

Negative Examples
***

In [None]:
tpr = 0
for img in os.listdir('attachments/Negative Examples/images/'):
    file_path = f'attachments/Negative Examples/images/{img}'
    content = read_img_azure_ocr(file_path)
    isInvoice = invoice_classifier(content)
    tpr += 1 if "Yes" in isInvoice else 0
    print(f"{img} is Invoice: {isInvoice}")

print("-------------------------------------------------------------------------------------------------")
print(f"\n\nTrue Positive Rate: {tpr}/{len(os.listdir('attachments/Negative Examples/images/'))}")

### PDF information extraction using PDF Plumber + Azure OCR
***


In [None]:
def read_pdf_as_image_azureocr(file_path, image_path='attachments/Positive Examples/images/sample.png'):
    """Rasterise each page of a PDF and read its text with Azure OCR.

    Args:
        file_path: Path of the PDF file.
        image_path: Where each rendered page is written before OCR; the file
            is overwritten for every page.

    Returns:
        The OCR text of all pages concatenated in page order, or ``""`` if
        rendering fails (the error is printed, not raised).
    """
    print("Reading PDF as image using Azure OCR...")
    try:
        content = ''
        # The context manager closes the document once all pages are read;
        # it was previously left open.
        with pymupdf.open(file_path) as doc:
            for page in doc:
                pix = page.get_pixmap()
                pix.save(image_path)
                content += read_img_azure_ocr(image_path)
        return content

    except Exception as e:
        print(f"Failed to extract text from PDF: {e}")
        return ""

In [None]:
tpr = 0
# Classify (at most the first twenty) known-invoice PDFs from pdfplumber
# text; when the verdict contains "No", retry with Azure OCR on the rendered
# PDF. The denominator is the number of PDFs actually evaluated; it
# previously counted the whole directory although only a slice was processed.
positive_pdfs = os.listdir('attachments/Positive Examples/pdfs/')[0:20]
for pdf in positive_pdfs:
    file_path = f'attachments/Positive Examples/pdfs/{pdf}'
    content = read_pdf_pdfPlumber(file_path)
    isInvoice = invoice_classifier(content)

    if "No" in isInvoice:
        content = read_pdf_as_image_azureocr(file_path)
        isInvoice = invoice_classifier(content)

    tpr += 1 if "Yes" in isInvoice else 0
    print(f"{pdf} is Invoice: {isInvoice}")

print("-------------------------------------------------------------------------------------------------")
print(f"\n\nTrue Positive Rate: {tpr}/{len(positive_pdfs)}")

### Information extraction from Image using GPT-4o
***

In [4]:
def local_image_to_data_url(image_path):
    """Encode a local file as a ``data:<mime>;base64,<payload>`` URL.

    The MIME type is guessed from the file name; when it cannot be guessed,
    ``application/octet-stream`` is used.
    """
    detected = guess_type(image_path)[0]
    mime_type = 'application/octet-stream' if detected is None else detected

    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()

    payload = base64.b64encode(raw_bytes).decode('utf-8')
    return "data:" + mime_type + ";base64," + payload

In [5]:
def read_img_gpt_4o(image_url):
    """Ask the chat deployment to transcribe the text shown in an image.

    Args:
        image_url: URL of the image, e.g. a ``data:`` URL produced by
            ``local_image_to_data_url``.

    Returns:
        The text returned by the model, or ``""`` when the reply carries no
        text.
    """
    query = "Extract the text from the image."
    system_prompt = ''' You are an helpful AI assistant and your job is to extract the text in the image and provide it to the user. You can use the image to text conversion techniques to extract the text. Once you have extracted the text, you can provide it to the user. Just output the extracted text and nothing else.'''

    response = client.chat.completions.create(
        model=os.getenv('GPT_DEPLOYMENT_NAME'),
        messages=[
            {
                "role": "system",
                "content": [{"type": "text", "text": system_prompt}],
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": f"{query}"},
                    {"type": "image_url", "image_url": {"url": f"{image_url}"}},
                ],
            },
        ],
        max_tokens=2000,
        temperature=0,
        top_p=1,
    )

    # The SDK types message.content as optional; returning "" keeps a missing
    # reply from being formatted as the literal text "None" further down.
    return response.choices[0].message.content or ""

Positive Examples
***

In [7]:
# Transcribe every known-invoice image with GPT-4o, then classify the text.
image_directory = "attachments/Positive Examples/images/"

data_urls = []

for img in os.listdir(image_directory):
    image_path = f"{image_directory}{img}"
    image_url = local_image_to_data_url(image_path)
    content = read_img_gpt_4o(image_url)
    isInvoice = invoice_classifier(content)
    print("{} is Invoice: {}".format(img, isInvoice))

1020623.png is Invoice: Yes
20240627145459374.png is Invoice: Yes
24100044779.png is Invoice: Yes
33377.png is Invoice: Yes
721141.png is Invoice: Yes
INSU-47699_2024_6_28_22_36_15.png is Invoice: Yes
Invoice-291870.5-1.pdf.png is Invoice: Yes
Invoice_INV484574.png is Invoice: Yes
INV_0052120528.png is Invoice: Yes
Inv_96116_from_Wilmanco_Inc._8372.png is Invoice: Yes
sample.png is Invoice: Yes
Seco_Invoice_0000475386.png is Invoice: Yes
Statement_1759_from_Green_Circuits_Inc.pdf.png is Invoice: Yes
TRIMBLE 290340.png is Invoice: Yes


Negative Examples
***

In [8]:
# Transcribe every non-invoice image with GPT-4o, then classify the text.
image_directory = "attachments/Negative Examples/images/"

data_urls = []

for img in os.listdir(image_directory):
    image_path = f"{image_directory}{img}"
    image_url = local_image_to_data_url(image_path)
    content = read_img_gpt_4o(image_url)
    isInvoice = invoice_classifier(content)
    print("{} is Invoice: {}".format(img, isInvoice))

1.jpg is Invoice: No
2.png is Invoice: No


### Vendor/Supplier Name Extraction
***

In [None]:
def extract_supplier_name(content):
    """Ask the Azure OpenAI deployment for the vendor/supplier name of an invoice.

    Args:
        content: Text extracted from an invoice document.

    Returns:
        The model's reply as a string. The prompt asks for the name only, or
        a message saying the name was not found. An empty string is returned
        when the reply carries no text, so callers can run substring checks
        on the result.
    """
    query = f'''Extract the vendor/supplier name from the below invoice content:
            {content}'''

    system_prompt = ''' You are an helpful AI assistant. You help users to extract the vendor/supplier name from the given invoice content. The vendor/supplier name is usually found in the:
                                    - ** Remit To ** section of the invoice
                                    - ** Header ** of the invoice
                                    of the invoice. It is the name of the company or individual that is providing the goods or services. It is usually followed by the vendor/supplier address. You should extract the vendor/supplier name by looking at the first line of Remit address. If the vendor/supplier name is not present in the content, you can return a message saying that the vendor/supplier name is not found.
                                    ** Just return the name of the vendor/supplier and nothing else. **
                                    '''

    response = client.chat.completions.create(
        model=GPT_DEPLOYMENT_NAME,
        messages=[
            {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
            {"role": "user", "content": [{"type": "text", "text": query}]},
        ],
        max_tokens=100,
        temperature=0,
        top_p=0.95,
        seed=101,
    )

    # The SDK types message.content as optional; normalise a missing reply to
    # "" so that `"not found" in supplier_name` in the caller cannot raise.
    return response.choices[0].message.content or ""

In [None]:
# Extract the supplier name for every known-invoice PDF from pdfplumber
# text, retrying with Azure OCR when the model reports it was not found.
for pdf in os.listdir('attachments/Positive Examples/pdfs/'):
    file_path = f'attachments/Positive Examples/pdfs/{pdf}'
    content = read_pdf_pdfPlumber(file_path)
    supplier_name = extract_supplier_name(content)
    # Compare case-insensitively (the model may answer "Not found") and
    # tolerate a missing reply.
    if "not found" in (supplier_name or "").casefold():
        content = read_pdf_as_image_azureocr(file_path)
        supplier_name = extract_supplier_name(content)
    print(f"Supplier Name in {pdf}: {supplier_name}")
read_img_azure_ocr('attachments/Positive Examples/images/sample.png')
