<a href="https://colab.research.google.com/github/saish23/Deep-Learning-Projects-/blob/main/Invoice_Data_Extractor_for_Multiple_Pages.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!sudo apt-get install -y tesseract-ocr
!pip install pytesseract

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 16 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 1s (4,024 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debc

In [15]:
!sudo apt-get install -y tesseract-ocr
!pip install pytesseract

import cv2
import pytesseract
import json
import os
import numpy as np
import re
from tabulate import tabulate

# Specify the paths to the folders
image_folder = "/content/IMP"
output_folder = "/content/OUT"

# Create the output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Initialize Tesseract OCR
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'

# Define regular expressions for pattern matching
invoice_number_pattern = r"Invoice (\d+)"
invoice_date_pattern = r"Date (\d{2}/\d{2}/\d{4})"
gst_number_pattern = r"GST Number: ([\w\d-]+)"
vendor_name_pattern = r"Vendor Name: (.+)"
vendor_address_pattern = r"Vendor Address: (.+)"
delivery_address_pattern = r"Delivery Address: (.+)"
buyer_name_pattern = r"Buyer Name: (.+)"
buyer_address_pattern = r"Buyer Address: (.+)"
item_details_pattern = r"(\d+)\.\s(.+)\s\$(\d+\.\d+)\s(\d+)\s\$(\d+\.\d+)"
total_invoice_amount_pattern = r"Subtotal: (\d+\.\d+)"
total_tax_amount_pattern = r"Tax rate: (\d+\.\d+)%"
po_number_pattern = r"PO Number: (\d+)"

# Define regular expression for extracting tabular data
tabular_data_pattern = r"(\d+)\s+(.+)\s+(\d+\.\d+)\s+(\d+)\s+(\d+\.\d+)"

# Iterate through each file in the image folder
for filename in os.listdir(image_folder):
    if filename.lower().endswith(".jpg"):
        image_path = os.path.join(image_folder, filename)

        # Load and preprocess the image
        image = cv2.imread(image_path)
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        _, thresholded = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        # Extract text from image using OCR
        extracted_text = pytesseract.image_to_string(thresholded)

        # Extract tabular data using tabulate
        tabular_data_matches = re.findall(tabular_data_pattern, extracted_text, re.MULTILINE)

        # Process tabular data using tabulate
        tabular_data_headers = ["Product Description", "Price", "Quantity", "Total"]
        tabular_data = [tabular_data_headers] + tabular_data_matches

        # Convert tabular data to a formatted table string
        tabular_table = tabulate(tabular_data, headers="firstrow", tablefmt="grid")

        # Extract data using regular expressions
        invoice_number_match = re.search(invoice_number_pattern, extracted_text)
        invoice_date_match = re.search(invoice_date_pattern, extracted_text)
        gst_number_match = re.search(gst_number_pattern, extracted_text)
        vendor_name_match = re.search(vendor_name_pattern, extracted_text)
        vendor_address_match = re.search(vendor_address_pattern, extracted_text)
        delivery_address_match = re.search(delivery_address_pattern, extracted_text)
        buyer_name_match = re.search(buyer_name_pattern, extracted_text)
        buyer_address_match = re.search(buyer_address_pattern, extracted_text)
        item_details_matches = re.findall(item_details_pattern, extracted_text)
        total_invoice_amount_match = re.search(total_invoice_amount_pattern, extracted_text)
        total_tax_amount_match = re.search(total_tax_amount_pattern, extracted_text)
        po_number_match = re.search(po_number_pattern, extracted_text)

        # Check if matches were found before accessing groups
        invoice_number = invoice_number_match.group(1) if invoice_number_match else None
        invoice_date = invoice_date_match.group(1) if invoice_date_match else None
        gst_number = gst_number_match.group(1) if gst_number_match else None
        vendor_name = vendor_name_match.group(1) if vendor_name_match else None
        vendor_address = vendor_address_match.group(1) if vendor_address_match else None
        delivery_address = delivery_address_match.group(1) if delivery_address_match else None
        buyer_name = buyer_name_match.group(1) if buyer_name_match else None
        buyer_address = buyer_address_match.group(1) if buyer_address_match else None
        total_invoice_amount = total_invoice_amount_match.group(1) if total_invoice_amount_match else None
        total_tax_amount = total_tax_amount_match.group(1) if total_tax_amount_match else None
        po_number = po_number_match.group(1) if po_number_match else None

        # Organize extracted data into a dictionary
        data_dict = {
            "invoice_number": invoice_number,
            "invoice_date": invoice_date,
            "gst_number": gst_number,
            "vendor_name": vendor_name,
            "vendor_address": vendor_address,
            "delivery_address": delivery_address,
            "buyer_name": buyer_name,
            "buyer_address": buyer_address,
            "item_details": item_details_matches,
            "total_invoice_amount": total_invoice_amount,
            "total_tax_amount": total_tax_amount,
            "po_number": po_number,
            "extracted_text": extracted_text,
            "tabular_data": tabular_table,

            # Add other extracted data here
        }

        # Serialize data to JSON and save to a file
        output_filename = os.path.join(output_folder, filename.replace(".jpg", ".json"))
        json_output = json.dumps(data_dict, indent=4)
        with open(output_filename, 'w') as json_file:
            json_file.write(json_output)


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 16 not upgraded.


In [18]:
extracted_text

'WAG srand Name\nee TNVOIC\n\nInvoice to:\n\nDayanne $.Clone Invoice 05987\n6~ Unknown Street Date 94/01/2019\nLocation, Lorem Ipsum\n\n12345678-B\n\n   \n\n \n\n \n\n \n\nets Total\n1. Product Name $50.00 4 $ 200.00\n1, Product Name $ 10.00 1 $ 10.00\n1. Product Name $25.00 3 $75.00\nProduct Name 3 30.08 2 $ 60.00\nProduct Name $ 40.00 5 $ 200.00\nSubtotal: 5 545.00\nTax rate: 0.00%\nTOTAL 545.00\nThank you for you business\nTerms & Conditions Payment. info\nSimply dunmy text of the printing Simply dunny text of the printing\n\nand typesetting industry. and typesetting industry.\n\x0c'

In [16]:
data_dict

{'invoice_number': '05987',
 'invoice_date': '94/01/2019',
 'gst_number': None,
 'vendor_name': None,
 'vendor_address': None,
 'delivery_address': None,
 'buyer_name': None,
 'buyer_address': None,
 'item_details': [('1', 'Product Name', '25.00', '3', '75.00')],
 'total_invoice_amount': None,
 'total_tax_amount': '0.00',
 'po_number': None,
 'extracted_text': 'WAG srand Name\nee TNVOIC\n\nInvoice to:\n\nDayanne $.Clone Invoice 05987\n6~ Unknown Street Date 94/01/2019\nLocation, Lorem Ipsum\n\n12345678-B\n\n   \n\n \n\n \n\n \n\nets Total\n1. Product Name $50.00 4 $ 200.00\n1, Product Name $ 10.00 1 $ 10.00\n1. Product Name $25.00 3 $75.00\nProduct Name 3 30.08 2 $ 60.00\nProduct Name $ 40.00 5 $ 200.00\nSubtotal: 5 545.00\nTax rate: 0.00%\nTOTAL 545.00\nThank you for you business\nTerms & Conditions Payment. info\nSimply dunmy text of the printing Simply dunny text of the printing\n\nand typesetting industry. and typesetting industry.\n\x0c',

In [17]:
# Print the saved JSON file
print(f"Saved JSON file: {output_filename}")

# Read and print the contents of the saved JSON file
with open(output_filename, 'r') as json_file:
    saved_json_contents = json_file.read()
    print(saved_json_contents)

Saved JSON file: /content/OUT/2019042512-invoice-template-red.json-840.json
{
    "invoice_number": "05987",
    "invoice_date": "94/01/2019",
    "gst_number": null,
    "vendor_name": null,
    "vendor_address": null,
    "delivery_address": null,
    "buyer_name": null,
    "buyer_address": null,
    "item_details": [
        [
            "1",
            "Product Name",
            "25.00",
            "3",
            "75.00"
        ]
    ],
    "total_invoice_amount": null,
    "total_tax_amount": "0.00",
    "po_number": null,
    "extracted_text": "WAG srand Name\nee TNVOIC\n\nInvoice to:\n\nDayanne $.Clone Invoice 05987\n6~ Unknown Street Date 94/01/2019\nLocation, Lorem Ipsum\n\n12345678-B\n\n   \n\n \n\n \n\n \n\nets Total\n1. Product Name $50.00 4 $ 200.00\n1, Product Name $ 10.00 1 $ 10.00\n1. Product Name $25.00 3 $75.00\nProduct Name 3 30.08 2 $ 60.00\nProduct Name $ 40.00 5 $ 200.00\nSubtotal: 5 545.00\nTax rate: 0.00%\nTOTAL 545.00\nThank you for you business\nTerms 