<a href="https://colab.research.google.com/github/soan12345/OCR/blob/main/BEST_OCR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import pdfplumber
import re
import json

class ElectricityBillParser:
    def __init__(self, pdf_path: str):
        self.pdf_path = pdf_path
        self.text = self._extract_text()

    def _extract_text(self) -> str:
        """Extract text from the PDF."""
        with pdfplumber.open(self.pdf_path) as pdf:
            return ' '.join(page.extract_text() for page in pdf.pages)

    def _find_pattern(self, pattern: str, default: str = '') -> str:
        """Helper method to find a pattern in the text."""
        match = re.search(pattern, self.text, re.IGNORECASE | re.MULTILINE)
        return match.group(1).strip() if match else default

    def get_bill_details(self) -> dict:
        return {
            "bill_for": self._find_pattern(r'Bill For\s*:\s*([A-Za-z]+-\d{4})'),
            "date_of_bill": self._find_pattern(r'Date of Bill\s*:\s*(\d{2}/\d{2}/\d{4})'),
            "invoice_no": self._find_pattern(r'Invoice No\.\s*:\s*(\d+)'),
            "book_folio_no": self._find_pattern(r'Book Folio No\.*\s*(\d+)'),
            "consumer_no": self._find_pattern(r'Consumer No\.*\s*(\d+[\*\d]*)'),
            "c_a_no": self._find_pattern(r"C\.?A\.? No\.?\s*[:\-]?\s*(\d+)"),
            "cycle": self._find_pattern(r"Cycle\s*:\s*(\d+)"),
            "type_of_supply": self._find_pattern(r"Type of Supply\s*:\s*(\d+P|LT\s*[A-Z\s\-]+)"),
            "service_no": self._find_pattern(r'Service No\.?\s*(\d+-[A-Z]-[A-Z])'),
            "installation_no": self._find_pattern(r'Installation No\.?\s*(\d+)'),
            "sanctioned_load": self._find_pattern(r'Sanctioned Load\s*:\s*([\d\.]+ KW)'),
            "security_deposit": self._find_pattern(r'Security Deposit\s*:?\s*(\d+\.?\d*)'),
            "last_payment_received": self._find_pattern(r'Last Payment Received\s*:?\s*(\d+\.?\d*)'),
            "last_payment_received_date": self._find_pattern(r'Last Payment Date\s*:?\s*(\d{2}/\d{2}/\d{4})'),
            "bill_period_from": self._find_pattern(r'Bill Period From\s*:?\s*(\d{2}/\d{2}/\d{4})'),
            "bill_period_to": self._find_pattern(r'Bill Period To\s*:?\s*(\d{2}/\d{2}/\d{4})'),
            "tariff": self._find_pattern(r'Tariff\s*:?\s*(LT-[A-Z\s]+)'),
            "category": self._find_pattern(r'Category\s*:?\s*(RESIDENTIAL|COMMERCIAL|INDUSTRIAL)'),
            "ward": self._find_pattern(r'\b([A-Z]N)\b')
        }

    def get_bill_amounts(self) -> dict:
        return {
            "current_bill_amount": self._find_pattern(r'Current Bill Amount\s*[`₹]\s*(\d+\.?\d*)'),
            "past_due": self._find_pattern(r'Past Dues\s*[`₹]\s*(\d+\.?\d*)'),
            "due_date": self._find_pattern(r'Due Date\s*(\d{2}/\d{2}/\d{4})'),
            "bill_amount_before_due_date": self._find_pattern(r'Bill Amount Before Due Date\s*[`₹]\s*(\d+\.?\d*)'),
            "bill_amount_after_due_date": self._find_pattern(r'Bill Amount After Due Date\s*[`₹]\s*(\d+\.?\d*)')
        }

    def get_customer_details(self) -> dict:
        name_match = re.search(r'Name\s*:\s*([^\n]+?)(?=\s*Mobile No)', self.text, re.IGNORECASE)
        name = name_match.group(1).strip() if name_match else ''
        return {
            "name": name,
            "mobile_no": self._find_pattern(r'Mobile No\s*:\s*(\d+[X]*\d+)'),
            "email_id": self._find_pattern(r'Email ID\s*:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'),
            "billing_address": self._find_pattern(r'Billing Address\s*:\s*([^\n]+?)(?=Power Supply Address|$)'),
            "power_supply_address": self._find_pattern(r'Power Supply Address\s*:\s*([^\n]+?)(?=Bill For|$)')
        }

    def get_important_contacts(self) -> dict:
        return {
            "customer_care_office_address": self._find_pattern(r'Customer Care Office[^:]*:\s*([^\n]+?)(?=Phone|$)'),
            "customer_care_office_phone": self._find_pattern(r'Customer Care Phone\s*:\s*(\d+)'),
            "customer_care_office_email": self._find_pattern(r'Customer Care Email\s*:\s*([^\s]+@[^\s]+)'),
            "complaint_office_phone": self._find_pattern(r'Complaint Office Phone\s*:\s*([^\n]+)'),
            "complaint_office_email": self._find_pattern(r'Complaint Office Email\s*:\s*([^\s]+@[^\s]+)'),
            "for_street_lighting_complaints": {
                "phone": self._find_pattern(r'Street Light Complaints Phone\s*:\s*(\d+)'),
                "email": self._find_pattern(r'Street Light Complaints Email\s*:\s*([^\n]+)')
            }
        }

    def get_additional_information(self) -> dict:
        return {
            "pm_surya_ghar": {
                "title": "PM Surya Ghar - Mukt Bijli Ghara",
                "description": self._find_pattern(r'PM Surya Ghar[^:]*:\s*([^\n]+?)(?=For more information|$)'),
                "link": self._find_pattern(r'(?:https?://)?(?:www\.)?mysolarplant\.in'),
                "phone": self._find_pattern(r'Solar Phone\s*:\s*(\d+)'),
                "email": self._find_pattern(r'Solar Email\s*:\s*([^\s]+@[^\s]+)')
            },
            "payment_options": {
                "title": "Pay Bills on miBEST",
                "description": self._find_pattern(r'miBEST Description\s*:\s*([^\n]+)'),
                "link": "https://www.bestundertaking.com",
                "app_download_links": {
                    "ios": self._find_pattern(r'iOS App\s*:\s*(https://[^\s]+)'),
                    "android": self._find_pattern(r'Android App\s*:\s*(https://[^\s]+)')
                }
            }
        }

    def get_units_consumed(self) -> list:
        units_pattern = r'([A-Za-z]+-\d{2})\s*(\d+)'
        matches = re.findall(units_pattern, self.text)
        return [{"month": month, "units": int(units)} for month, units in matches]

    def extract_all_data(self) -> dict:
        return {
            "bill_details": self.get_bill_details(),
            "bill_amounts": self.get_bill_amounts(),
            "customer_details": self.get_customer_details(),
            "important_contacts": self.get_important_contacts(),
            "additional_information": self.get_additional_information(),
            "units_consumed": self.get_units_consumed()
        }

def parse_electricity_bill(pdf_path: str) -> dict:
    """Function to parse the electricity bill and return structured data."""
    parser = ElectricityBillParser(pdf_path)
    return parser.extract_all_data()


In [None]:
from google.colab import files
uploaded = files.upload()

Saving 604571021_2408.pdf to 604571021_2408.pdf


In [None]:
import io
import pdfplumber

# Get filename of uploaded file
filename = next(iter(uploaded))

# Parse and print results
data = parse_electricity_bill(filename)
print(json.dumps(data, indent=2))

{
  "bill_details": {
    "bill_for": "Aug-2024",
    "date_of_bill": "16/08/2024",
    "invoice_no": "408604571021",
    "book_folio_no": "",
    "consumer_no": "",
    "c_a_no": "",
    "cycle": "",
    "type_of_supply": "",
    "service_no": "",
    "installation_no": "",
    "sanctioned_load": "",
    "security_deposit": "",
    "last_payment_received": "",
    "last_payment_received_date": "",
    "bill_period_from": "",
    "bill_period_to": "",
    "tariff": "",
    "category": "",
    "ward": "on"
  },
  "bill_amounts": {
    "current_bill_amount": "",
    "past_due": "",
    "due_date": "",
    "bill_amount_before_due_date": "",
    "bill_amount_after_due_date": ""
  },
  "customer_details": {
    "name": "",
    "mobile_no": "98XXXXX156",
    "email_id": "XXXXXXXXada@gmail.com",
    "billing_address": "Type of Supply Bill Period",
    "power_supply_address": "Security Deposit Ward"
  },
  "important_contacts": {
    "customer_care_office_address": "",
    "customer_care_offic

In [None]:
# Save to JSON file
with open('extracted_data.json', 'w') as f:
    json.dump(data, f, indent=2)

# Download the JSON file
files.download('extracted_data.json')

In [None]:
!apt install tesseract-ocr

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [None]:
#!pip install pytesseract pdf2image PyPDF2 pillow

import pytesseract
from pdf2image import convert_from_path
import json
import re
from PIL import Image
import os

# Set Tesseract OCR path (modify this if needed for your environment)
#pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'  # Update for your OS

# Function to preprocess and perform OCR on PDF pages
def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF using OCR."""
    try:
        pages = convert_from_path(pdf_path, dpi=300)  # Adjust DPI for better OCR accuracy
        extracted_text = ""

        for page_num, page in enumerate(pages):
            # Perform OCR on the page
            text = pytesseract.image_to_string(page, config="--psm 6")
            extracted_text += f"Page {page_num + 1}:\n{text}\n"

        return extracted_text
    except Exception as e:
        print(f"Error during PDF processing: {e}")
        return ""

# Function to parse the extracted text and structure it into JSON
def parse_extracted_text(text):
    # Define all the regex patterns
    patterns = {

        "name": r"Name\s*:\s*([\s\S]+?)(?=\nMobile No|Mobile No)",
        "billing_address": r"Billing Address\s*:\s*([\s\S]+?\w+-\d{6})",
        "power_supply_address":  r"Power Supply Address\s*:\s*([\s\S]+?\w+-\d{6})",
        "mobile_no": r"Mobile No\s*:\s*(\d+[X]*\d+)",
        "email_id": r"Email ID\s*:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})",
        "bill_for": r"Bill For\s*:\s*([A-Za-z]+\s*-\s*\d{4})",
        "date_of_bill": r"Date of Bill\s*:\s*([\d/]+)",
        "invoice_no": r"Invoice No\s*\.?\s*:\s*(\d+)",
        "book_folio_no": r"Book Folio No\s*\.?\s*:\s*(\d+)",
        "consumer_no": r"Consumer No\s*\.?\s*:\s*([\d\-X]+)",
        "c_a_no": r"C\.?A\.? No\s*\.?\s*:\s*(\d+)",
        "cycle": r"Cycle\s*:\s*(\d+)",
        "type_of_supply": r"Type of Supply\s*:\s*([\w]+)",
        "service_no": r"Service No\s*\.?\s*:\s*([\w\-]+)",
        "installation_no": r"Installation No\s*\.?\s*:\s*(\d+)",
        "sanctioned_load": r"Sanctioned Load\s*:\s*([\d\.]+)\s*KW",
        "security_deposit": r"Security Deposit\s*:\s*([\d\.]+)",
        "last_payment_received": r"Last Payment Received\s*₹?\s*([\d\.]+)",
        "last_payment_received_date": r"Last Payment Received Date\s*:\s*([\d/]+)",
        "bill_period": r"Bill Period\s*:\s*([\d/]+)\s*-\s*\n?([\d/]+)",
        "tariff": r"Tariff\s*:\s*(\S+)",
        "category": r"Category\s*:\s*(\S+)",
        "ward": r"Ward\s*:\s*(\S+)",
"current_bill_amount": r"Current\s*Bill\s*Amount\s*[₹]?\s*([\d.,]+)",
    "past_due": r"Past\s*Dues\s*[₹]?\s*([\d.,]+)",
    "due_date": r"Due\s*Date\s*[*]?\s*([\d/]+)",
    "bill_amount_before_due_date": r"Bill\s*Amount\s*Before\s*Due\s*Date\s*[₹]?\s*([\d.,]+)",
    "bill_amount_after_due_date": r"Bill\s*Amount\s*After\s*Due\s*Date\s*[₹]?\s*([\d.,]+)"
}
    parsed_data = {
        "customer_details": {},
        "bill_details": {},
        "bill_amounts": {}
    }


    # Apply regex patterns to the extracted text with case-insensitive flag
    for field, pattern in patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)  # Added re.IGNORECASE flag
        value = match.group(1).strip() if match else None
        if field in ["name", "billing_address", "power_supply_address", "mobile_no", "email_id"]:
            parsed_data["customer_details"][field] = value
        elif field in ["current_bill_amount", "past_due", "due_date", "bill_amount_before_due_date", "bill_amount_after_due_date"]:
            parsed_data["bill_amounts"][field] = value
        else:
            parsed_data["bill_details"][field] = value

    return parsed_data


# Main function
def main(pdf_path, output_path="output.json"):
    """Extracts text from a PDF, parses it, and saves the structured data to a JSON file."""
    if not os.path.exists(pdf_path):
        print(f"PDF file not found: {pdf_path}")
        return

    # Step 1: Extract text from PDF
    print("Extracting text from PDF...")
    extracted_text = extract_text_from_pdf(pdf_path)

    if not extracted_text:
        print("No text extracted. Exiting.")
        return

    # Step 2: Parse extracted text
    print("Parsing extracted text...")
    parsed_data = parse_extracted_text(extracted_text)

    # Step 3: Save the structured data to a JSON file
    try:
        with open(output_path, "w") as json_file:
            json.dump(parsed_data, json_file, indent=4)
        print(f"Parsed data saved to {output_path}")
    except Exception as e:
        print(f"Error saving JSON: {e}")

    # Optional: Print parsed data to console for verification
    print("Extracted and parsed data:")
    print(json.dumps(parsed_data, indent=4))

# Replace with the path to your PDF file
pdf_path = "/content/604571021_2408.pdf"  # Update with the actual PDF path
main(pdf_path, "output.json")


Extracting text from PDF...
Parsing extracted text...
Parsed data saved to output.json
Extracted and parsed data:
{
    "customer_details": {
        "name": "Bill For : Aug-2024 Date of Bill: 16/08/2024 Invoice No. : 408604571021\nMRS P J & SWATI L & J P GADA Book Folio N C N .",
        "billing_address": "Type of Supply ; 3p Bill Period : 08/07/2024 -\n0-4-B, FLOOR-2,PLOT-3A,GURDEVI MANSION,K A Service No . yy.\nSUBRAMANYAM MARG, BRAHMANWADA KINGS ; 312229-X-X . 07/08/2024\nCIRCLE, MATUNGA, MUMBAI-400019",
        "power_supply_address": "Security Deposit: Ward\n\u00a9 | 0-4-B, FLOOR-2,PLOT-3A,GURDEVI MANSION,K A y Weposit = 3856.00 > FN\nS | SUBRAMANYAM MARG, BRAHMANWADA KINGS ; -\noO \u00bb\n2 | CIRCLE,MATUNGA,MUMBAI-400019",
        "mobile_no": "98XXXXX156",
        "email_id": "XXXXXXXXada@gmail.com"
    },
    "bill_details": {
        "bill_for": "Aug-2024",
        "date_of_bill": "16/08/2024",
        "invoice_no": "408604571021",
        "book_folio_no": null,
        "con

In [None]:
!apt-get install tesseract-ocr

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 0s (25.3 MB/s)
Selecting previously unselected package tesseract-ocr-eng.
(Reading database ... 123634 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-e

In [None]:
!pip install pytesseract pdf2image pillow opencv-python-headless numpy pandas
!pip install PyPDF2
!apt-get install -y poppler-utils tesseract-ocr


Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pytesseract, pdf2image
Successfully installed pdf2image-1.17.0 pytesseract-0.3.13
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  poppler-utils tesseract-ocr tesse

In [None]:
#!pip install pytesseract pdf2image PyPDF2 pillow

import pytesseract
from pdf2image import convert_from_path
import json
import re
from PIL import Image
import os
import cv2
import numpy as np

# Set Tesseract OCR path (modify this if needed for your environment)
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'  # Update for your OS

# Function to preprocess and perform OCR on specific regions
def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF using OCR."""
    try:
        pages = convert_from_path(pdf_path, dpi=300)  # Adjust DPI for better OCR accuracy
        extracted_text = ""

        for page_num, page in enumerate(pages):
            # Convert PIL Image to OpenCV Image
            open_cv_image = cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR)

            # Preprocessing: Convert to grayscale and apply adaptive threshold
            gray = cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2GRAY)
            processed_image = cv2.adaptiveThreshold(
                gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
            )

            # Perform OCR on the preprocessed image
            text = pytesseract.image_to_string(processed_image, config="--psm 6")
            extracted_text += f"Page {page_num + 1}:\n{text}\n"

        return extracted_text
    except Exception as e:
        print(f"Error during PDF processing: {e}")
        return ""

# Function to parse the extracted text and structure it into JSON
def parse_extracted_text(text):
    # Define all the regex patterns
    patterns = {
        "name": r"Name\s*:\s*([\s\S]+?)(?=\nMobile No|Mobile No)",
        "billing_address": r"Billing Address\s*:\s*([\s\S]+?\w+-\d{6})",
        "power_supply_address": r"Power Supply Address\s*:\s*([\s\S]+?\w+-\d{6})",
        "mobile_no": r"Mobile No\s*:\s*(\d+[X]*\d+)",
        "email_id": r"Email ID\s*:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})",
        "bill_for": r"Bill For\s*:\s*([A-Za-z]+\s*-\s*\d{4})",
        "date_of_bill": r"Date of Bill\s*:\s*([\d/]+)",
        "invoice_no": r"Invoice No\s*\.?\s*:\s*(\d+)",
        "book_folio_no": r"Book Folio No\s*\.?\s*:\s*(\d+)",
        "consumer_no": r"Consumer No\s*\.?\s*:\s*([\d\-X]+)",
        "c_a_no": r"C\.?A\.? No\s*\.?\s*:\s*(\d+)",
        "cycle": r"Cycle\s*:\s*(\d+)",
        "type_of_supply": r"Type of Supply\s*:\s*([\w]+)",
        "service_no": r"Service No\s*\.?\s*:\s*([\w\-]+)",
        "installation_no": r"Installation No\s*\.?\s*:\s*(\d+)",
        "sanctioned_load": r"Sanctioned Load\s*:\s*([\d\.]+)\s*KW",
        "security_deposit": r"Security Deposit\s*:\s*([\d\.]+)",
        "last_payment_received": r"Last Payment Received\s*₹?\s*([\d\.]+)",
        "last_payment_received_date": r"Last Payment Received Date\s*:\s*([\d/]+)",
        "bill_period": r"Bill Period\s*:\s*([\d/]+)\s*-\s*\n?([\d/]+)",
        "tariff": r"Tariff\s*:\s*(\S+)",
        "category": r"Category\s*:\s*(\S+)",
        "ward": r"Ward\s*:\s*(\S+)",
        "current_bill_amount": r"Current\s*Bill\s*Amount\s*[₹]?\s*([\d.,]+)",
        "past_due": r"Past\s*Dues\s*[₹]?\s*([\d.,]+)",
        "due_date": r"Due\s*Date\s*[*]?\s*([\d/]+)",
        "bill_amount_before_due_date": r"Bill\s*Amount\s*Before\s*Due\s*Date\s*[₹]?\s*([\d.,]+)",
        "bill_amount_after_due_date": r"Bill\s*Amount\s*After\s*Due\s*Date\s*[₹]?\s*([\d.,]+)",
    }

    parsed_data = {
        "customer_details": {},
        "bill_details": {},
        "bill_amounts": {},
    }

    # Apply regex patterns to the extracted text
    for field, pattern in patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        value = match.group(1).strip() if match else None
        if field in ["name", "billing_address", "power_supply_address", "mobile_no", "email_id"]:
            parsed_data["customer_details"][field] = value
        elif field in [
            "current_bill_amount",
            "past_due",
            "due_date",
            "bill_amount_before_due_date",
            "bill_amount_after_due_date",
        ]:
            parsed_data["bill_amounts"][field] = value
        else:
            parsed_data["bill_details"][field] = value

    return parsed_data


# Main function
def main(pdf_path, output_path="output.json"):
    """Extracts text from a PDF, parses it, and saves the structured data to a JSON file."""
    if not os.path.exists(pdf_path):
        print(f"PDF file not found: {pdf_path}")
        return

    # Step 1: Extract text from PDF
    print("Extracting text from PDF...")
    extracted_text = extract_text_from_pdf(pdf_path)

    if not extracted_text:
        print("No text extracted. Exiting.")
        return

    # Step 2: Parse extracted text
    print("Parsing extracted text...")
    parsed_data = parse_extracted_text(extracted_text)

    # Step 3: Save the structured data to a JSON file
    try:
        with open(output_path, "w") as json_file:
            json.dump(parsed_data, json_file, indent=4)
        print(f"Parsed data saved to {output_path}")
    except Exception as e:
        print(f"Error saving JSON: {e}")

    # Optional: Print parsed data to console for verification
    print("Extracted and parsed data:")
    print(json.dumps(parsed_data, indent=4))


# Replace with the path to your PDF file
pdf_path = "/content/604571021_2408.pdf"  # Update with the actual PDF path
main(pdf_path, "output.json")


Extracting text from PDF...
Parsing extracted text...
Parsed data saved to output.json
Extracted and parsed data:
{
    "customer_details": {
        "name": "Bill For : Aug-2024 Date of Bill : 16/08/2024 Invoice No. : 408604571021\nMRS P J & SWATIL & J P GADA Book Folio N C N x",
        "billing_address": "Type of Supply ; 3p Bill Period : 08/07/2024 -\n0-4-B, FLOOR-2,PLOT-3A,GURDEVI MANSION, K A Service No . Ly\nSUBRAMANYAM MARG,BRAHMANWADA KINGS . 312229-X-X . 07/08/2024\nCIRCLE,MATUNGA, MUMBAI-400019",
        "power_supply_address": "5 ity D it Ward\n%S | 0-4-B,FLOOR-2,PLOT-3A,GURDEVI MANSION, K A SCurTty BEPOsIC 3856.00 ar > FN\n& | SUBRAMANYAM MARG, BRAHMANWADA KINGS : :\no \u2019\n2 | CIRCLE,MATUNGA,MUMBAI-400019",
        "mobile_no": "98XXXXX156",
        "email_id": "XXXXXXXXada@gmail.com"
    },
    "bill_details": {
        "bill_for": "Aug-2024",
        "date_of_bill": "16/08/2024",
        "invoice_no": "408604571021",
        "book_folio_no": null,
        "consumer_n

In [None]:
#!pip install pytesseract pdf2image PyPDF2 pillow

import pytesseract
from pdf2image import convert_from_path
import json
import re
from PIL import Image
import os
import cv2
import numpy as np

# Set Tesseract OCR path (modify this if needed for your environment)
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'  # Update for your OS

# Function to preprocess and perform OCR on specific regions
def extract_text_from_pdf(pdf_path):
    """Extracts text from a PDF using OCR."""
    try:
        pages = convert_from_path(pdf_path, dpi=300)  # Adjust DPI for better OCR accuracy
        extracted_text = ""

        for page_num, page in enumerate(pages):
            # Convert PIL Image to OpenCV Image
            open_cv_image = cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR)

            # Preprocessing: Convert to grayscale and apply adaptive threshold
            gray = cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2GRAY)
            processed_image = cv2.adaptiveThreshold(
                gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
            )

            # Perform OCR on the preprocessed image
            text = pytesseract.image_to_string(processed_image, config="--psm 6")
            extracted_text += f"Page {page_num + 1}:\n{text}\n"

        return extracted_text
    except Exception as e:
        print(f"Error during PDF processing: {e}")
        return ""

# Function to parse the extracted text and structure it into JSON
def parse_extracted_text(text):
    # Define all the regex patterns
    patterns = {
        "name": r"Name\s*:\s*([\s\S]+?)(?=\nBilling Address)",
        "billing_address": r"Billing Address\s*:\s*([\s\S]+?\w+-)()",
        "mobile_no": r"Mobile\s*/\s*Tel\.?\s*No\.\s*:\s*(\d{10})",
        "email_id": r"Email ID\s*:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})",
        "district": r"District\s*/\s*Division\s*:\s*([A-Za-z\s]+)",
        "walking_sequence": r"Walking Sequence\s*:\s*([A-Z0-9]+)",
        "bill_month": r"Bill Month\s*:\s*([A-Z]{3}-\d{2})",
        "bill_date": r"Bill Date\s*:\s*(\d{2}-\d{2}-\d{4})",
        "sanctioned_load": r"Sanctioned Load\s*:\s*([\d\.]+)\s*\(kVA\)",
        "contract_demand": r"Contract Demand\s*:\s*([\d\.]+)",
        "power_factor": r"Power Factor\s*:\s*([\d\.]+)",
        "pole_no": r"Pole No\.?\s*:\s*([\w\d]+)",
        "meter_reading_status": r"Meter Reading Status\s*:\s*([\w]+)",
        "cycle_no": r"Cycle No\.?\s*\s*(\d+)",
        "ca_no": r"CA No\.\s*(\d+)",
        "energisation_date": r"Energisation Date\s*\:\s*([\d]{2}\.[\d]{2}\.[\d]{4})",
        "meter_type": r"Meter Type\s*\:\s*([A-Z0-9]+)",
        "supply_type": r"Supply Type\s*\:\s*([A-Z]+)",
        "bill_no": r"Bill No\.\s*(\d+)",
        "bill_basis": r"Bill Basis\s*\:\s*([A-Za-z]+)",
        "od_no": r"O\.D\. No\.\s*\:\s*([A-Za-z/0-9]+)",
        "cctv_tagged_no": r"CCTV Tagged No\s*([A-Za-z]*)",  # Assuming it can be empty or have a value
        "street_light_tagged_no": r"Street Light Tagged No\s*([A-Za-z]*)",  # Assuming it can be empty or have a value
        "wifi_tagged": r"WI-FI Tagged\s*([A-Z])",
        #"bill_amount_payable": r"Bill Amount Payable\s*
        "due_date_of_payment": r"Due Date of Payment\s*\n\s*([\d]{2}-[\d]{2}-[\d]{4})"
    }

    parsed_data = {
        "customer_details": {},
        "bill_details": {},
    }

    # Apply regex patterns to the extracted text
    for field, pattern in patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        value = match.group(1).strip() if match else None
        if field in ["name", "billing_address", "mobile_no", "email_id", "district"]:
            parsed_data["customer_details"][field] = value
        else:
            parsed_data["bill_details"][field] = value

    return parsed_data

# Main function
def main(pdf_path, output_path="output.json"):
    """Extracts text from a PDF, parses it, and saves the structured data to a JSON file."""
    if not os.path.exists(pdf_path):
        print(f"PDF file not found: {pdf_path}")
        return

    # Step 1: Extract text from PDF
    print("Extracting text from PDF...")
    extracted_text = extract_text_from_pdf(pdf_path)

    if not extracted_text:
        print("No text extracted. Exiting.")
        return

    # Step 2: Parse extracted text
    print("Parsing extracted text...")
    parsed_data = parse_extracted_text(extracted_text)

    # Step 3: Save the structured data to a JSON file
    try:
        with open(output_path, "w") as json_file:
            json.dump(parsed_data, json_file, indent=4)
        print(f"Parsed data saved to {output_path}")
    except Exception as e:
        print(f"Error saving JSON: {e}")

    # Optional: Print parsed data to console for verification
    print("Extracted and parsed data:")
    print(json.dumps(parsed_data, indent=4))

# Replace with the path to your PDF file
pdf_path = "/content/11304_DelhiNCR_103424503_10-Aug-24 (1).pdf"  # Update with the actual PDF path
main(pdf_path, "output.json")

Extracting text from PDF...
Parsing extracted text...
Parsed data saved to output.json
Extracted and parsed data:
{
    "customer_details": {
        "name": "DRGUPTA & ANIL KUMAR",
        "billing_address": "LT Mr. JATTU RAM&D R GUPTA SHOP Sanctioned Load :5.00 (kVA) CA No. 7103424503\n. Energisation Date :05.10.2010\nAT G-9 GF PLOT NO 5 SEC 12 KRISHNA MALL Contract Demand : Meter Type -1PSK\nDWARKA NEAR KRISHNA MALL NEW DELHI 110078 MDI : 00 Supply Type \u2018LT\nPower Factor 1.000 Bill No. 100788028110\nPole No. :DWKNNPO0SO op aa on 0432782830\n. .D. No. :\nMeter Reading Status :DL CCTV Tagged \u201cNo\nMobile/Tel. No. :9971847863 Cycle No. 13 Street Light Tagged :No\nEmail ID : drgupta81@gmail.com WEFT Tagged \u2018No\nDistrict / Division \u2014 :Dwarka Tariff Category | :Non.Domestic [ LT ] (Up to 10 kw)\nWalking Sequence :S12M15393A0AA\nBill Month :AUG-24\nBill Date :10-08-2024 Customer Care Centre No, 19123 (24x7 Toll Free\nMeter No Units Billed Consumption (Current) Billed Con

In [None]:
import pytesseract
from pdf2image import convert_from_path
import pdfplumber
import re
import json
from PIL import Image, ImageDraw
import os

# Set Tesseract OCR path (modify this if needed for your environment)
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'  # Update for your OS

# Function to extract text from a PDF table using pdfplumber
def extract_table_from_pdf(pdf_path):
    """Extracts tables from a PDF using pdfplumber."""
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            tables.extend(page.extract_tables())
    return tables

# Function to convert text to an image
def text_to_image(text):
    """Convert text to an image."""
    # Create a blank image with white background
    img = Image.new('RGB', (800, 200), color='white')
    d = ImageDraw.Draw(img)

    # Draw the text on the image
    d.text((10, 10), text, fill=(0, 0, 0))

    return img

# Function to perform OCR on each cell of the table
def ocr_table_cells(table):
    """Performs OCR on each cell of the table."""
    ocr_results = []
    for row in table:
        ocr_row = []
        for cell in row:
            if cell:  # Check if the cell is not empty
                # Convert cell text to image and perform OCR
                img = text_to_image(cell)
                ocr_text = pytesseract.image_to_string(img)
                ocr_row.append(ocr_text.strip())
            else:
                ocr_row.append(None)
        ocr_results.append(ocr_row)
    return ocr_results

# Function to parse extracted text and structure it into JSON
def parse_extracted_text(text):
    # Define all the regex patterns
    patterns = {
        "bill_for": r"Bill For\s*:\s*([A-Za-z]{3}-\d{4})",
        "date_of_bill": r"Date of Bill\s*:\s*([\d/]+)",
        "invoice_no": r"Invoice No\s*\.?\s*:\s*(\d+)",
        "book_folio_no": r"Book Folio No\s*\.?\s*:\s*(\d+)",
        "consumer_no": r"Consumer No\s*\.?\s*:\s*([\d\-X]+)",
        "c_a_no": r"C\.?A\.? No\s*\.?\s*:\s*(\d+)",
        "cycle": r"Cycle\s*:\s*([\w]+)",
        "type_of_supply": r"Type of Supply\s*:\s*([\w]+)",
        "service_no": r"Service No\s*\.?\s*:\s*([\w\-]+)",
        "installation_no": r"Installation No\s*\.?\s*:\s*(\d+)",
        "sanctioned_load": r"Sanctioned Load\s*:\s*([\d\.]+)\s*KW",
        "security_deposit": r"Security Deposit\s*:\s*([\d\.]+)",
        "last_payment_received": r"Last Payment Received\s*₹?\s*([\d\.]+)",
        "last_payment_received_date": r"Last Payment Received Date\s*:\s*([\d/]+)",
        "bill_period_from": r"Bill Period From\s*:\s*([\d/]+)",
        "bill_period_to": r"Bill Period To\s*:\s*([\d/]+)",
        "tariff": r"Tariff\s*:\s*(\S+)",
        "category": r"Category\s*:\s*(\S+)",
        "ward": r"Ward\s*:\s*(\S+)",
        "current_bill_amount": r"Current Bill Amount\s*[₹]?\s*([\d.,]+)",
        "past_due": r"Past Dues\s*[₹]?\s*([\d.,]+)",
        "due_date": r"Due Date\s*:\s*([\d/]+)",
        "bill_amount_before_due_date": r"Bill Amount Before Due Date\s*[₹]?\s*([\d.,]+)",
        "bill_amount_after_due_date": r"Bill Amount After Due Date\s*[₹]?\s*([\d., ]+)"
    }

    extracted_data = {
        "bill_details": {},
        "bill_amount": {}
    }

    for key, pattern in patterns.items():
        match = re.search(pattern, text)
        if match:
            extracted_data["bill_details"][key] = match.group(1).strip()

    return extracted_data

# Main function to process the PDF and extract data
def process_pdf(pdf_path):
    """Main function to process the PDF and extract data."""
    tables = extract_table_from_pdf(pdf_path)
    all_ocr_results = []

    for table in tables:
        ocr_results = ocr_table_cells(table)
        all_ocr_results.extend(ocr_results)

    # Combine all OCR results into a single string for parsing
    combined_text = "\n".join(["\t".join(filter(None, row)) for row in all_ocr_results])
    parsed_data = parse_extracted_text(combined_text)

    return parsed_data

# Example usage
pdf_path = "/content/'bill_202410_604571021_en' (1) (1).pdf" # Update with your PDF file path
extracted_data = process_pdf(pdf_path)

# Print the extracted data in JSON format
print(json.dumps(extracted_data, indent=4))

{
    "bill_details": {
        "bill_for": "Cct-2024",
        "invoice_no": "410604571021",
        "cycle": "09",
        "type_of_supply": "3P",
        "service_no": "312228-X-X08",
        "security_deposit": "3868",
        "category": "RESIDENTIAL.",
        "ward": "FN",
        "bill_amount_after_due_date": "2131.190.11"
    },
    "bill_amount": {}
}


In [None]:
#Cropping the img and then performing OCR
import pdf2image
import cv2
import numpy as np
import pytesseract
from PIL import Image
import re
import json
from typing import List, Dict, Any, Tuple
import os


pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'  # Update for your OS

class PDFTableExtractor:


    def convert_pdf_to_images(self, pdf_path: str, dpi: int = 300) -> List[Image.Image]:
        """
        Convert PDF to list of PIL Images
        """
        try:
            return pdf2image.convert_from_path(pdf_path, dpi=dpi)
        except Exception as e:
            print(f"Error converting PDF to images: {str(e)}")
            return []

    def preprocess_image(self, image: np.ndarray) -> np.ndarray:
        """
        Preprocess image for better table detection
        """
        # Convert to grayscale
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Binary threshold
        _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

        # Dilate to connect text
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
        dilated = cv2.dilate(binary, kernel, iterations=3)

        return dilated

    def detect_tables(self, image: np.ndarray) -> List[Tuple[int, int, int, int]]:
        """
        Detect tables in the image and return their coordinates
        """
        # Find contours
        contours, _ = cv2.findContours(image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        # Filter contours based on area
        min_area = image.shape[0] * image.shape[1] * 0.01  # 1% of image area
        table_contours = [cnt for cnt in contours if cv2.contourArea(cnt) > min_area]

        # Get bounding rectangles
        tables = []
        for cnt in table_contours:
            x, y, w, h = cv2.boundingRect(cnt)
            tables.append((x, y, x+w, y+h))

        return tables

    def crop_tables(self, image: Image.Image, coordinates: List[Tuple[int, int, int, int]]) -> List[Image.Image]:
        """
        Crop tables from the image using coordinates
        """
        return [image.crop(coord) for coord in coordinates]

    def perform_ocr(self, image: Image.Image) -> str:
        """
        Perform OCR on an image
        """
        try:
            # Configure OCR for better table recognition
            custom_config = r'--oem 3 --psm 6'
            return pytesseract.image_to_string(image, config=custom_config)
        except Exception as e:
            print(f"OCR Error: {str(e)}")
            return ""

    def parse_table_text(self, text: str) -> Dict[str, Any]:
        """
        Parse extracted text into structured data
        """
        # Define patterns for different sections
        patterns = {
            'bill_details': {
                'bill_for': r'Bill For\s*:\s*([A-Za-z]+-\d{4})',
                'date_of_bill': r'Date of Bill\s*:\s*(\d{2}/\d{2}/\d{4})',
                'invoice_no': r'Invoice No\.\s*:\s*(\d+)',
                'bill_period': r'Bill Period\s*:\s*(\d{2}/\d{2}/\d{4})\s*-\s*(?:\n|\r\n)*\s*(\d{2}/\d{2}/\d{4})',
                  "book_folio_no": r"Book Folio No\s*\.?\s*:\s*(\d+)",
        "consumer_no": r"Consumer No\s*\.?\s*:\s*([\d\-X]+)",
        "c_a_no": r"C\.?A\.? No\s*\.?\s*:\s*(\d+)",
        "cycle": r"Cycle\s*:\s*([\w]+)",
        "type_of_supply": r"Type of Supply\s*:\s*([\w]+)",
        "service_no": r"Service No\s*\.?\s*:\s*([\w\-]+)",
        "installation_no": r"Installation No\s*\.?\s*:\s*(\d+)",
        "sanctioned_load": r"Sanctioned Load\s*:\s*([\d\.]+)\s*KW",
        "security_deposit": r"Security Deposit\s*:\s*([\d\.]+)",
        "last_payment_received": r"Last Payment Received\s*₹?\s*([\d\.]+)",
        "last_payment_received_date": r"Last Payment Received Date\s*:\s*([\d/]+)",
        "bill_period_from": r"Bill Period From\s*:\s*([\d/]+)",
        "bill_period_to": r"Bill Period To\s*:\s*([\d/]+)",
        "tariff": r"Tariff\s*:\s*(\S+)",
        "category": r"Category\s*:\s*(\S+)",
        "ward": r"Ward\s*:\s*(\S+)",
            },
            'customer_details': {
                "name": r"Name\s*:\s*([\s\S]+?)(?=\nMobile No|Mobile No)",
        "billing_address": r"Billing Address\s*:\s*([\s\S]+?\w+-\d{6})",
        "power_supply_address": r"Power Supply Address\s*:\s*([\s\S]+?\w+-\d{6})",
        "mobile_no": r"Mobile No\s*:\s*(\d+[X]*\d+)",
        "email_id": r"Email ID\s*:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})",
            }
        }

        parsed_data = {}
        for section, section_patterns in patterns.items():
            parsed_data[section] = {}
            for field, pattern in section_patterns.items():
                match = re.search(pattern, text)
                if match:
                    if field == 'bill_period':
                        parsed_data[section]['bill_period_from'] = match.group(1)
                        parsed_data[section]['bill_period_to'] = match.group(2)
                    else:
                        parsed_data[section][field] = match.group(1)

        return parsed_data

    def process_pdf(self, pdf_path: str, output_path: str = None) -> Dict[str, Any]:
        """
        Process PDF and extract table data
        """
        # Convert PDF to images
        images = self.convert_pdf_to_images(pdf_path)
        if not images:
            return {}

        all_data = []
        for idx, pil_image in enumerate(images):
            # Convert PIL Image to OpenCV format
            opencv_image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)

            # Preprocess image
            processed = self.preprocess_image(opencv_image)

            # Detect tables
            table_coordinates = self.detect_tables(processed)

            # Crop tables
            table_images = self.crop_tables(pil_image, table_coordinates)

            # Process each table
            for table_idx, table_image in enumerate(table_images):
                # Perform OCR
                table_text = self.perform_ocr(table_image)

                # Parse text
                parsed_data = self.parse_table_text(table_text)
                all_data.append(parsed_data)

                # Save cropped table image for debugging
                if output_path:
                    table_image.save(f"{output_path}/table_{idx}_{table_idx}.png")

        # Combine all extracted data
        combined_data = self.combine_data(all_data)

        # Save to JSON if output path provided
        if output_path:
            with open(f"{output_path}/extracted_data.json", 'w', encoding='utf-8') as f:
                json.dump(combined_data, f, indent=2, ensure_ascii=False)

        return combined_data

    def combine_data(self, data_list: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Combine data from multiple tables into a single structure
        """
        combined = {
            "bill_details": {},
            "customer_details": {},
        }

        for data in data_list:
            for section in data:
                combined[section].update(data[section])

        return combined

# Example usage
if __name__ == "__main__":
    # Initialize extractor
    extractor = PDFTableExtractor()

    # Process PDF
    pdf_path = "/content/'bill_202410_604571021_en' (1) (1).pdf"
    output_dir = "output"

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Extract data
    result = extractor.process_pdf(pdf_path, output_dir)

    # Print results
    print(json.dumps(result, indent=2))

Error converting PDF to images: Unable to get page count.
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't find trailer dictionary
Syntax Error: Couldn't read xref table

{}


In [None]:

import pytesseract
import cv2
import re
import json
import numpy as np
from typing import Dict, Any, Union, Tuple

class ElectricityBillImageParser:
    def __init__(self, image_path: str):
        self.image_path = image_path
        # Define all regex patterns as class attributes
        self.patterns = {
            # Customer Details
            'name': r"Name\s*:\s*([\s\S]+)(?=\nMobile No|Mobile No)",
            'billing_address': r"Billing Address\s*:\s*([\s\S]+?\w+-\d{6})",
            'power_supply_address': r"Power Supply Address\s*:\s*([\s\S]+?\w+-\d{6})",
            'mobile_no': r"Mobile No\s*:\s*(\d+[X]*\d+)",
            'email_id': r"Email ID\s*:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})",

            # Bill Details
            'book_folio_no': r"Book Folio No\s*\.?\s*:\s*(\d+)",
            'consumer_no': r"Consumer No\s*\.?\s*:\s*([\d\-X]+)",
            'c_a_no': r"C\.?A\.? No\s*\.?\s*:\s*(\d+)",
            'cycle': r"Cycle\s*:\s*(\d+)",
            'type_of_supply': r"Type of Supply\s*:\s*([\w]+)",
            'service_no': r"Service No\s*\.?\s*:\s*([\w\-]+)",
            'installation_no': r"Installation No\s*\.?\s*:\s*(\d+)",
            'sanctioned_load': r"Sanctioned Load\s*:\s*([\d\.]+)\s*KW",
            'security_deposit': r"Security Deposit\s*:\s*([\d\.]+)",
            'last_payment_received': r"Last Payment Received\s*₹?\s*([\d\.]+)",
            'last_payment_received_date': r"Last Payment Received Date\s*:\s*([\d/]+)",
            'tariff': r"Tariff\s*:\s*(\S+)",
            'category': r"Category\s*:\s*(\S+)",
            'ward': r"Ward\s*:\s*(\S+)",

            # Bill Amounts
            'current_bill_amount': r"Current\s*Bill\s*Amount\s*[₹]?\s*([\d.,]+)",
            'past_due': r"Past\s*Dues\s*[₹]?\s*([\d.,]+)",
            'due_date': r"Due\s*Date\s*[*]?\s*([\d/]+)",
            'bill_amount_before_due_date': r"Bill\s*Amount\s*Before\s*Due\s*Date\s*[₹]?\s*([\d.,]+)",
            'bill_amount_after_due_date': r"Bill\s*Amount\s*After\s*Due\s*Date\s*[₹]?\s*([\d.,]+)"
        }
        self.text = self._extract_text()

    def _preprocess_image(self, image: np.ndarray) -> np.ndarray:
        """Preprocess the image to improve OCR accuracy"""
        # Convert to grayscale
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Apply thresholding to get black and white image
        _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        # Remove noise
        denoised = cv2.fastNlMeansDenoising(binary)

        # Increase image size for better recognition
        enlarged = cv2.resize(denoised, None, fx=1.5, fy=1.5, interpolation=cv2.INTER_CUBIC)

        return enlarged

    def _extract_text(self) -> str:
        """Extract text from image using OCR"""
        image = cv2.imread(self.image_path)
        if image is None:
            raise ValueError(f"Could not read image from {self.image_path}")

        processed_image = self._preprocess_image(image)
        custom_config = r'--oem 3 --psm 6'
        return pytesseract.image_to_string(processed_image, config=custom_config)

    def _find_pattern(self, pattern: str, default: str = '') -> str:
        """Find pattern in text with error handling"""
        try:
            match = re.search(pattern, self.text, re.IGNORECASE | re.MULTILINE)
            return match.group(1).strip() if match else default
        except Exception:
            return default

    def _extract_bill_period(self) -> Tuple[str, str]:
        """Extract bill period from and to dates"""
        pattern = r"Bill Period\s*:\s*([\d/]+)\s*-\s*\n?([\d/]+)"
        match = re.search(pattern, self.text, re.IGNORECASE)
        if match:
            return match.group(1), match.group(2)
        return '', ''

    def get_customer_details(self) -> dict:
        return {
            "name": self._find_pattern(self.patterns['name']),
            "mobile_no": self._find_pattern(self.patterns['mobile_no']),
            "email_id": self._find_pattern(self.patterns['email_id']),
            "billing_address": self._find_pattern(self.patterns['billing_address']),
            "power_supply_address": self._find_pattern(self.patterns['power_supply_address'])
        }

    def get_bill_details(self) -> dict:
        from_date, to_date = self._extract_bill_period()
        return {
            "book_folio_no": self._find_pattern(self.patterns['book_folio_no']),
            "consumer_no": self._find_pattern(self.patterns['consumer_no']),
            "c_a_no": self._find_pattern(self.patterns['c_a_no']),
            "cycle": self._find_pattern(self.patterns['cycle']),
            "type_of_supply": self._find_pattern(self.patterns['type_of_supply']),
            "service_no": self._find_pattern(self.patterns['service_no']),
            "installation_no": self._find_pattern(self.patterns['installation_no']),
            "sanctioned_load": self._find_pattern(self.patterns['sanctioned_load']),
            "security_deposit": self._find_pattern(self.patterns['security_deposit']),
            "last_payment_received": self._find_pattern(self.patterns['last_payment_received']),
            "last_payment_received_date": self._find_pattern(self.patterns['last_payment_received_date']),
            "bill_period_from": from_date,
            "bill_period_to": to_date,
            "tariff": self._find_pattern(self.patterns['tariff']),
            "category": self._find_pattern(self.patterns['category']),
            "ward": self._find_pattern(self.patterns['ward'])
        }

    def get_bill_amounts(self) -> dict:
        return {
            "current_bill_amount": self._find_pattern(self.patterns['current_bill_amount']),
            "past_due": self._find_pattern(self.patterns['past_due']),
            "due_date": self._find_pattern(self.patterns['due_date']),
            "bill_amount_before_due_date": self._find_pattern(self.patterns['bill_amount_before_due_date']),
            "bill_amount_after_due_date": self._find_pattern(self.patterns['bill_amount_after_due_date'])
        }

    def extract_all_data(self) -> dict:
        return {
            "bill_details": self.get_bill_details(),
            "bill_amounts": self.get_bill_amounts(),
            "customer_details": self.get_customer_details()
        }

def parse_electricity_bill_image(image_path: str) -> dict:
    """Parse electricity bill from an image file"""
    parser = ElectricityBillImageParser(image_path)
    return parser.extract_all_data()

# Example usage
if __name__ == "__main__":
    try:
        result = parse_electricity_bill_image('/content/Screenshot 2025-01-07 123058.png')
        print(json.dumps(result, indent=2))
    except Exception as e:
        print(f"Error processing bill: {str(e)}")

{
  "bill_details": {
    "book_folio_no": "",
    "consumer_no": "",
    "c_a_no": "",
    "cycle": "",
    "type_of_supply": "",
    "service_no": "",
    "installation_no": "",
    "sanctioned_load": "",
    "security_deposit": "3856.00",
    "last_payment_received": "",
    "last_payment_received_date": "",
    "bill_period_from": "",
    "bill_period_to": "",
    "tariff": "",
    "category": "",
    "ward": ""
  },
  "bill_amounts": {
    "current_bill_amount": "",
    "past_due": "",
    "due_date": "",
    "bill_amount_before_due_date": "",
    "bill_amount_after_due_date": ""
  },
  "customer_details": {
    "name": "",
    "mobile_no": "",
    "email_id": "",
    "billing_address": "",
    "power_supply_address": ""
  }
}


In [None]:
import pytesseract
from pdf2image import convert_from_path
import json
import re
import cv2
import numpy as np
import os

# Set Tesseract OCR path (modify this if needed for your environment)
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'  # Update for your OS

# Function to convert PDF to images
def pdf_to_images(pdf_path):
    """Convert PDF to images."""
    return convert_from_path(pdf_path, dpi=300)

# Function to crop the table from the image
def crop_table(image, crop_coords):
    """Crop the table from the image using specified coordinates."""
    # Convert PIL Image to OpenCV Image
    open_cv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    # Crop the image using the provided coordinates
    x, y, w, h = crop_coords
    cropped_image = open_cv_image[y:y+h, x:x+w]
    return cropped_image

# Function to perform OCR on the cropped image
def perform_ocr(cropped_image):
    """Perform OCR on the cropped image."""
    # Convert to grayscale
    gray_image = cv2.cvtColor(cropped_image, cv2.COLOR_BGR2GRAY)
    # Apply adaptive thresholding
    processed_image = cv2.adaptiveThreshold(gray_image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    # Perform OCR
    text = pytesseract.image_to_string(processed_image, config="--psm 6")
    return text

# Function to parse the extracted text and structure it into JSON
def parse_extracted_text(text):
    # Define all the regex patterns
    patterns = {
        "name": r"Name\s*:\s*([\s\S]+?)(?=\nMobile No|Mobile No)",
        "billing_address": r"Billing Address\s*:\s*([\s\S]+?\w+-\d{6})",
        "power_supply_address": r"Power Supply Address\s*:\s*([\s\S]+?\w+-\d{6})",
        "mobile_no": r"Mobile No\s*:\s*(\d+[X]*\d+)",
        "email_id": r"Email ID\s*:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})",
        "bill_for": r"Bill For\s*:\s*([A-Za-z]+\s*-\s*\d{4})",
        "date_of_bill": r"Date of Bill\s*:\s*([\d/]+)",
        "invoice_no": r"Invoice No\s*\.?\s*:\s*(\d+)",
        "book_folio_no": r"Book Folio No\s*\.?\s*:\s*(\d+)",
        "consumer_no": r"Consumer No\s*\.?\s*:\s*([\d\-X]+)",
        "c_a_no": r"C\.?A\.? No\s*\.?\s*:\s*(\d+)",
        "cycle": r"Cycle\s*:\s*(\d+)",
        "type_of_supply": r"Type of Supply\s*:\s*([\w]+)",
        "service_no": r"Service No\s*\.?\s*:\s*([\w\-]+)",
        "installation_no": r"Installation No\s*\.?\s*:\s*(\d+)",
        "sanctioned_load": r"Sanctioned Load\s*:\s*([\d\.]+)\s*KW",
        "security_deposit": r"Security Deposit\s*:\s*([\d\.]+)",
        "last_payment_received": r"Last Payment Received\s*₹?\s*([\d\.]+)",
        "last_payment_received_date": r"Last Payment Received Date\s*:\s*([\d/]+)",
        "bill_period": r"Bill Period\s*:\s*([\d/]+)\s*-\n?([\d/]+)",
        "tariff": r"Tariff\s*:\s*(\S+)",
        "category": r"Category\s*:\s*(\S+)",
        "ward": r"Ward\s*:\s*(\S+)",
        "current_bill_amount": r"Current\s*Bill\s*Amount\s*[₹]?\s*([\d.,]+)",
        "past_due": r"Past\s*Dues\s*[₹]?\s*([\d.,]+)",
        "due_date": r"Due\s*Date\s*[*]?\s*([\d/]+)",
        "bill_amount_before_due_date": r"Bill\s*Amount\s*Before\s*Due\s*Date\s*[₹]?\s*([\d.,]+)",
        "bill_amount_after_due_date": r"Bill\s*Amount\s*After\s*Due\s*Date\s*[₹]?\s*([\d.,]+)",
    }

    parsed_data = {
        "customer_details": {},
        "bill_details": {},
        "bill_amounts": {},
    }

    # Apply regex patterns to the extracted text
    for field, pattern in patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        value = match.group(1).strip() if match else None
        if field in ["name", "billing_address", "power_supply_address", "mobile_no", "email_id"]:
            parsed_data["customer_details"][field] = value
        elif field in [
            "current_bill_amount",
            "past_due",
            "due_date",
            "bill_amount_before_due_date",
            "bill_amount_after_due_date",
        ]:
            parsed_data["bill_amounts"][field] = value
        else:
            parsed_data["bill_details"][field] = value

    return parsed_data

# Main function
def main(pdf_path, output_path="output.json", crop_coords=(0, 0, 800, 600)):
    """Extracts text from a PDF, crops the table, performs OCR, parses it, and saves the structured data to a JSON file."""
    if not os.path.exists(pdf_path):
        print(f"PDF file not found: {pdf_path}")
        return

    # Step 1: Convert PDF to images
    print("Converting PDF to images...")
    images = pdf_to_images(pdf_path)

    parsed_data = []

    # Step 2: Process each image
    for page_num, image in enumerate(images):
        print(f"Processing page {page_num + 1}...")
        # Crop the table from the image
        cropped_image = crop_table(image, crop_coords)
        # Perform OCR on the cropped image
        extracted_text = perform_ocr(cropped_image)
        # Parse the extracted text
        parsed_page_data = parse_extracted_text(extracted_text)
        parsed_data.append(parsed_page_data)

    # Step 3: Save the structured data to a JSON file
    try:
        with open(output_path, "w") as json_file:
            json.dump(parsed_data, json_file, indent=4)
        print(f"Parsed data saved to {output_path}")
    except Exception as e:
        print(f"Error saving JSON: {e}")

    # Optional: Print parsed data to console for verification
    print("Extracted and parsed data:")
    print(json.dumps(parsed_data, indent=4))

# Replace with the path to your PDF file and crop coordinates
pdf_path = "/content/'bill_202410_604571021_en' (1) (1).pdf"  # Update with the actual PDF path
crop_coords = (50, 50, 800, 600)  # Update with the actual coordinates for cropping
main(pdf_path, "output.json", crop_coords)

Converting PDF to images...
Processing page 1...
Processing page 2...
Parsed data saved to output.json
Extracted and parsed data:
[
    {
        "customer_details": {
            "name": "MRS P J & SWATI L & J P GADA",
            "billing_address": null,
            "power_supply_address": null,
            "mobile_no": "98XXXXX156",
            "email_id": "XXXXXXXXada@gmail.com"
        },
        "bill_details": {
            "bill_for": null,
            "date_of_bill": null,
            "invoice_no": null,
            "book_folio_no": null,
            "consumer_no": null,
            "c_a_no": null,
            "cycle": null,
            "type_of_supply": null,
            "service_no": null,
            "installation_no": null,
            "sanctioned_load": null,
            "security_deposit": null,
            "last_payment_received": null,
            "last_payment_received_date": null,
            "bill_period": null,
            "tariff": null,
            "category": nu

In [None]:
import pytesseract
from pdf2image import convert_from_path
import json
import re
import cv2
import numpy as np
import os

# Set Tesseract OCR path (modify this if needed for your environment)
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'  # Update for your OS

# Function to convert PDF to images
def pdf_to_images(pdf_path):
    """Convert PDF to images."""
    return convert_from_path(pdf_path, dpi=300)

# Function to crop the table from the image
def crop_table(image, crop_coords):
    """Crop the table from the image using specified coordinates."""
    # Convert PIL Image to OpenCV Image
    open_cv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    # Crop the image using the provided coordinates
    x, y, w, h = crop_coords
    cropped_image = open_cv_image[y:y+h, x:x+w]
    return cropped_image

# Function to perform OCR on the cropped image
def perform_ocr(cropped_image):
    """Perform OCR on the cropped image."""
    # Convert to grayscale
    gray_image = cv2.cvtColor(cropped_image, cv2.COLOR_BGR2GRAY)
    # Apply adaptive thresholding
    processed_image = cv2.adaptiveThreshold(gray_image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    # Perform OCR
    text = pytesseract.image_to_string(processed_image, config="--psm 6")
    return text

# Function to parse the extracted text and structure it into JSON
def parse_extracted_text(text):
    # Define all the regex patterns
    patterns = {
        "name": r"Name\s*:\s*([\s\S]+?)(?=\nMobile No|Mobile No)",
        "billing_address": r"Billing Address\s*:\s*([\s\S]+?\w+-\d{6})",
        "power_supply_address": r"Power Supply Address\s*:\s*([\s\S]+?\w+-\d{6})",
        "mobile_no": r"Mobile No\s*:\s*(\d+[X]*\d+)",
        "email_id": r"Email ID\s*:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})",
        "bill_for": r"Bill For\s*:\s*([A-Za-z]+\s*-\s*\d{4})",
        "date_of_bill": r"Date of Bill\s*:\s*([\d/]+)",
        "invoice_no": r"Invoice No\s*\.?\s*:\s*(\d+)",
        "book_folio_no": r"Book Folio No\s*\.?\s*:\s*(\d+)",
        "consumer_no": r"Consumer No\s*\.?\s*:\s*([\d\-X]+)",
        "c_a_no": r"C\.?A\.? No\s*\.?\s*:\s*(\d+)",
        "cycle": r"Cycle\s*:\s*(\d+)",
        "type_of_supply": r"Type of Supply\s*:\s*([\w]+)",
        "service_no": r"Service No\s*\.?\s*:\s*([\w\-]+)",
        "installation_no": r"Installation No\s*\.?\s*:\s*(\d+)",
        "sanctioned_load": r"Sanctioned Load\s*:\s*([\d\.]+)\s*KW",
        "security_deposit": r"Security Deposit\s*:\s*([\d\.]+)",
        "last_payment_received": r"Last Payment Received\s*₹?\s*([\d\.]+)",
        "last_payment_received_date": r"Last Payment Received Date\s*:\s*([\d/]+)",
        "bill_period": r"Bill Period\s*:\s*([\d/]+)\s*-\s*\n?([\d/]+)",
        "tariff": r"Tariff\s*:\s*(\S+)",
        "category": r"Category\s*:\s*(\S+)",
        "ward": r"Ward\s*:\s*(\S+)",
        "current_bill_amount": r"Current\s*Bill\s*Amount\s*[₹]?\s*([\d.,]+)",
        "past_due": r"Past\s*Dues\s*[₹]?\s*([\d.,]+)",
        "due_date": r"Due\s*Date\s*[*]?\s*([\d/]+)",
        "bill_amount_before_due_date": r"Bill\s*Amount\s*Before\s*Due\s*Date\s*[₹]?\s*([\d.,]+)",
        "bill_amount_after_due_date": r"Bill\s*Amount\s*After\s*Due\s*Date\s*[₹]?\s*([\d.,]+)",
    }

    parsed_data = {
        "customer_details": {},
        "bill_details": {},
        "bill_amounts": {},
    }

    # Apply regex patterns to the extracted text
    for field, pattern in patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        value = match.group(1).strip() if match else None
        if field in ["name", "billing_address", "power_supply_address", "mobile_no", "email_id"]:
            parsed_data["customer_details"][field] = value
        elif field in [
            "current_bill_amount",
            "past_due",
            "due_date",
            "bill_amount_before_due_date",
            "bill_amount_after_due_date",
        ]:
            parsed_data["bill_amounts"][field] = value
        else:
            parsed_data["bill_details"][field] = value

    return parsed_data

# Main function
def main(pdf_path, output_path="output.json"):
    """Extracts text from a PDF, crops multiple tables, performs OCR, parses it, and saves the structured data to a JSON file."""
    if not os.path.exists(pdf_path):
        print(f"PDF file not found: {pdf_path}")
        return

    # Step 1: Convert PDF to images
    print("Converting PDF to images...")
    images = pdf_to_images(pdf_path)

    parsed_data = []

    # Define coordinates for all tables
    table_coordinates = [
        (25, 24, 931, 969),  # Table 1
          # Table 5
    ]

    # Step 2: Process each image
    for page_num, image in enumerate(images):
        print(f"Processing page {page_num + 1}...")
        for coords in table_coordinates:
            # Crop the table from the image
            cropped_image = crop_table(image, coords)
            # Perform OCR on the cropped image
            extracted_text = perform_ocr(cropped_image)
            # Parse the extracted text
            parsed_page_data = parse_extracted_text(extracted_text)
            parsed_data.append(parsed_page_data)

    # Step 3: Save the structured data to a JSON file
    try:
        with open(output_path, "w") as json_file:
            json.dump(parsed_data, json_file, indent=4)
        print(f"Parsed data saved to {output_path}")
    except Exception as e:
        print(f"Error saving JSON: {e}")

    # Optional: Print parsed data to console for verification
    print("Extracted and parsed data:")
    print(json.dumps(parsed_data, indent=4))

# Replace with the path to your PDF file
pdf_path = "/content/604571021_2408.pdf"  # Update with the actual PDF path
main(pdf_path, "output.json")

Converting PDF to images...
Processing page 1...
Processing page 2...
Parsed data saved to output.json
Extracted and parsed data:
[
    {
        "customer_details": {
            "name": "MRS P J & SWATI L & J P GADA",
            "billing_address": "0-4-B, FLOOR-2,PLOT-3A, GURDEVI MANSION, K A\nSUBRAMANYAM MARG, BRAHMANWADA KINGS\nCIRCLE, MATUNGA, MUMBAI-400019",
            "power_supply_address": "S 0-4-B, FLOOR-2,PLOT-3A, GURDEVI MANSION, K A\n5 | SUBRAMANYAM MARG,BRAHMANWADA KINGS\n= CIRCLE, MATUNGA, MUMBAI-400019",
            "mobile_no": "98XXXXX156",
            "email_id": "XXXXXXXXada@gmail.com"
        },
        "bill_details": {
            "bill_for": null,
            "date_of_bill": null,
            "invoice_no": null,
            "book_folio_no": null,
            "consumer_no": null,
            "c_a_no": null,
            "cycle": null,
            "type_of_supply": null,
            "service_no": null,
            "installation_no": null,
            "sanctioned_