<a href="https://colab.research.google.com/github/soan12345/OCR/blob/main/MP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pytesseract pdf2image opencv-python pillow

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pytesseract, pdf2image
Successfully installed pdf2image-1.17.0 pytesseract-0.3.13


In [3]:
!apt-get install -y poppler-utils tesseract-ocr

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  poppler-utils tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 4 newly installed, 0 to remove and 21 not upgraded.
Need to get 5,002 kB of archives.
After this operation, 16.3 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.6 [186 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 5,002 kB in 1s (4,069 kB/s)
Selecting previously unselected package popp

In [46]:
import pytesseract
from pdf2image import convert_from_path
import re
from typing import Dict, Any
import logging

class BillExtractor:
    def __init__(self):
        self.logger = logging.getLogger(__name__)
        logging.basicConfig(level=logging.INFO)


        self.field_patterns = {
            'Consumer_ID': r'Service\s*Number[:\s]*([A-Z0-9]+)',
            'Meter_No_KWH': r'Meter\s*Serial\s*No[:\s]*([A-Z0-9]+)',
            'Old_Account_No': r'Old\s*Service\s*Number[:\s]*([A-Z0-9-]+)',
            #'Service Number':r'Service Number\s*([A-Z0-9-]+)',
            'Phase': r'Phase\s*Given[:\s]*([A-Z]+)',
            'Sanctioned_Load': r'Load\s*Sanctioned[:\s]*([0-9.]+\s*KW)',
            'Meter_Status': r'Reading\s*Type[:\s]*([A-Z_]+)',
            'Tariff': r'Tariff\s*Class[:\s]*([^\n]+)',
            "mdi": r"Maximum Demand\s*([\d.]+)",
            'Board_Name': r'(Madhya\s*Pradesh\s*Madhya\s*Kshetra\s*Vidyut\s*Vitran\s*Company\s*Ltd\.)',
            'Division': r'Division\s*Name[:\s]*([^\n]+?)(?=\s*(?:Tariff Class|$))',
            'Name_On_Bill': r'Mr\.\s*/\s*Ms\.\s*([^\n]+?)(?=\s*Bill\s*Demand|$)',
             "Address": r"Address[:\s]*([^\n]+)",
            'Bill_No': r'Bill\s*Number[:\s]*([A-Z0-9]+)',
            'Bill_Issue_Date': r'Bill\s*Date[:\s]*(\d{2}-[A-Za-z]+-\d{4})',
            'Bill_Due_Date': r'Bill\s*Payment\s*last\s*Date[:\s]*(\d{2}-[A-Za-z]+-\d{4})',
            'Opening_Reading_KWH': r'Previous\s*Reading[:\s]*(\d+[\.,]\d+)',
            'Closing_Reading_KWH': r'Current\s*Reading[:\s]*(\d+[\.,]\d+)',
            'Current_Reading_Date_KWH': r'Current\s*Read\s*Date[:\s]*(\d{2}-[A-Za-z]+-\d{4})',
            'Meter_Consumption_Unit_KWH': r'Units\s*consumed[:\s]*(\d+)',
            'Fixed_Charges': r'Fixed\s*Charge[:\s]*(\d+\.\d+)',
            'Energy_Charges': r'Energy\s*Charges[:\s]*(\d+\.\d+)',
            'Current_Bill_Amount': r'Current\s*Month\s*Bill[:\s]*(\d+\.\d+)',
            'Amount_Before_Due_Date': r'Total\s*Amount\s*Payable\s*On\s*Due\s*Date[:\s]*(\d+\.\d+)',
            'Amount_After_Due_Date': r'Total\s*Amount\s*Payable\s*After\s*Due\s*Date[:\s]*(\d+\.\d+)',
            'Last_Amount_Paid': r'Amount\s*Paid[:\s]*(\d+)',
            'Power_Factor': r'P\.F\.[:\s]*(\d+\.\d+)',
            'Security_Deposit_Paid': r'Security\s*Amount\s*Deposited[:\s]*(\d+\.\d+)',
        }

    def extract_from_pdf(self, pdf_path: str, debug: bool = True) -> Dict[str, Any]:
        """
        Extract information from PDF bill using OCR with debug mode
        """
        try:
            # Convert PDF to images
            images = convert_from_path(pdf_path)

            # Initialize results dictionary
            results = {field: None for field in self.field_patterns.keys()}

            # Process each page
            for page_num, image in enumerate(images, 1):
                # Performing OCR
                text = pytesseract.image_to_string(image)

                if debug:
                    # Save extracted text for debugging
                    with open(f'extracted_text_page_{page_num}.txt', 'w', encoding='utf-8') as f:
                        f.write(text)
                    self.logger.info(f"Saved extracted text from page {page_num}")

                # Extract fields using patterns
                for field, pattern in self.field_patterns.items():
                    if results[field] is None:
                        match = re.search(pattern, text)
                        if match:
                            results[field] = match.group(1).strip()
                            if debug:
                                self.logger.info(f"Found {field}: {results[field]}")


            results = {k: v.replace('|', '').strip() if v else v for k, v in results.items()}

            # Log missing fields in debug mode
            if debug:
                missing_fields = [field for field, value in results.items() if value is None]
                if missing_fields:
                    self.logger.warning(f"Failed to extract: {', '.join(missing_fields)}")

            return results

        except Exception as e:
            self.logger.error(f"Error processing PDF: {str(e)}")
            return {"error": str(e)}

def main():

    extractor = BillExtractor()


    results = extractor.extract_from_pdf("537_N2945012931_2025-01-10.pdf", debug=True)

    # Print results
    print("\nExtracted Fields:")
    print("-" * 50)
    for field, value in results.items():
        if value is not None:
            print(f"{field}: {value}")
    print("-" * 50)

if __name__ == "__main__":
    main()




Extracted Fields:
--------------------------------------------------
Consumer_ID: VI31
Meter_No_KWH: SECUREMPM10732
Old_Account_No: VI31-12-7125427470
Phase: THREE
Sanctioned_Load: 20.0 KW
Meter_Status: NORMAL
Tariff: LV2 [LV2.2]
mdi: 4.32
Board_Name: Madhya Pradesh Madhya Kshetra Vidyut Vitran Company Ltd.
Division: O&M Vidisha
Name_On_Bill: SHRI SANJEEV JAIN C/O YES BANK
Address: S/O SHRI SURESH JAIN, ARIHANT BIHAR COLONYPH- Total Bill Amount On Due Date (In Rs.) 12647.00
Bill_No: JAN25N001915722
Bill_Issue_Date: 10-Jan-2025
Current_Reading_Date_KWH: 01-Jan-2025
Meter_Consumption_Unit_KWH: 1
Fixed_Charges: 5328.00
Energy_Charges: 7541.70
Current_Bill_Amount: 13059.97
Amount_Before_Due_Date: 12647.00
Amount_After_Due_Date: 12805.00
Security_Deposit_Paid: 33253.00
--------------------------------------------------
