In [1]:
# Install the Tesseract OCR engine (system package) and pytesseract, its Python wrapper.
!sudo apt install tesseract-ocr
!pip install pytesseract

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 2s (2,685 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debc

In [2]:
# Download the receipts OCR dataset archive with the Kaggle CLI.
# NOTE(review): presumably requires Kaggle API credentials to be configured beforehand — confirm.
!kaggle datasets download -d trainingdatapro/ocr-receipts-text-detection

Dataset URL: https://www.kaggle.com/datasets/trainingdatapro/ocr-receipts-text-detection
License(s): Attribution-NonCommercial-NoDerivatives 4.0 International (CC BY-NC-ND 4.0)
Downloading ocr-receipts-text-detection.zip to /content
 78% 41.0M/52.6M [00:00<00:00, 70.7MB/s]
100% 52.6M/52.6M [00:00<00:00, 71.0MB/s]


In [3]:
# Unpack the downloaded archive (annotations.xml plus the boxes/ and images/ folders)
# into the current working directory.
!unzip '/content/ocr-receipts-text-detection.zip'

Archive:  /content/ocr-receipts-text-detection.zip
  inflating: annotations.xml         
  inflating: boxes/0.png             
  inflating: boxes/1.png             
  inflating: boxes/10.png            
  inflating: boxes/11.png            
  inflating: boxes/12.png            
  inflating: boxes/13.png            
  inflating: boxes/14.png            
  inflating: boxes/15.png            
  inflating: boxes/16.png            
  inflating: boxes/17.png            
  inflating: boxes/18.png            
  inflating: boxes/19.png            
  inflating: boxes/2.png             
  inflating: boxes/3.png             
  inflating: boxes/4.png             
  inflating: boxes/5.png             
  inflating: boxes/6.png             
  inflating: boxes/7.png             
  inflating: boxes/8.png             
  inflating: boxes/9.png             
  inflating: images/0.jpg            
  inflating: images/1.jpg            
  inflating: images/10.jpg           
  inflating: images/11.jpg           

In [4]:
import os
import cv2
import numpy as np
import pandas as pd
import pytesseract
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [5]:
class ReceiptOCRExtractor:
    """
    OCR pipeline for receipt images.

    The pipeline reads an image with OpenCV, binarises and denoises it, runs
    Tesseract (via pytesseract) on the result and parses the recognised text
    into a small dictionary (store name, date, items, tax, total). An optional
    Random Forest classifier can be trained on externally supplied features.
    """

    # A monetary amount with exactly two decimals, optionally using commas as
    # thousands separators (e.g. "4.05" or "1,299.99").
    _AMOUNT = r'(\d{1,3}(?:,\d{3})+\.\d{2}|\d+\.\d{2})'

    # "total" / "grand total" followed by an amount. The look-behind rejects
    # "Sub Total" / "Sub-Total" so a subtotal line is not reported as the total
    # ("subtotal" written as one word is already rejected by the leading \b).
    _TOTAL_RE = re.compile(
        r'\b(?<!sub[\s\-])(?:total|grand total)[:]*\s*\$?' + _AMOUNT + r'\b',
        re.IGNORECASE,
    )
    _TAX_RE = re.compile(r'\b(?:tax)[:]*\s*\$?' + _AMOUNT + r'\b', re.IGNORECASE)
    _DATE_RE = re.compile(r'\b(\d{1,2}/\d{1,2}/\d{2,4})\b')

    # "<name> <amount>" on a single (already stripped) line.
    _ITEM_RE = re.compile(r'^(.+)\s+\$?' + _AMOUNT + r'$')

    # Item names that are really summary rows and must not be listed as items.
    _SUMMARY_NAME_RE = re.compile(
        r'(?:sub[\s\-]*total|grand\s+total|total|(?:sales\s+)?tax)\s*:*',
        re.IGNORECASE,
    )

    def __init__(self):
        """
        Initialize the Receipt OCR Extractor
        """
        # Both stay None until train_classification_model() has been called.
        self.model = None
        self.scaler = None

    @staticmethod
    def _to_amount(raw):
        """Convert a matched amount string such as '1,299.99' to a float."""
        return float(raw.replace(',', ''))

    def preprocess_image(self, image_path):
        """
        Preprocess the receipt image for better OCR accuracy

        Args:
            image_path (str): Path to the input image

        Returns:
            numpy.ndarray: Preprocessed (grayscale, binarised, denoised) image

        Raises:
            ValueError: If the image cannot be read from ``image_path``
        """
        # cv2.imread does not raise on failure; it returns None instead, which
        # would otherwise surface as an opaque error inside cvtColor.
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError(
                f"Could not read image (missing file or unsupported format): {image_path}"
            )

        # Convert to grayscale
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Otsu picks the binarisation threshold automatically (the 0 is ignored)
        gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

        # Median blur removes isolated speckles left by thresholding
        gray = cv2.medianBlur(gray, 3)

        return gray

    def extract_text(self, image):
        """
        Extract text from preprocessed image using Tesseract

        Args:
            image (numpy.ndarray): Preprocessed image

        Returns:
            str: Extracted text
        """
        return pytesseract.image_to_string(image)

    def parse_receipt_text(self, text):
        """
        Parse extracted text to extract meaningful information

        Args:
            text (str): Extracted text from receipt

        Returns:
            dict: Extracted receipt information with the keys ``total``
                (float or None), ``date`` (str or None), ``store_name``
                (str or None), ``items`` (list of ``{'name', 'price'}`` dicts)
                and ``tax`` (float or None)
        """
        # Initialize default values
        receipt_info = {
            'total': None,
            'date': None,
            'store_name': None,
            'items': [],
            'tax': None
        }

        # Nothing recognised: return the defaults instead of an empty store name.
        if not text:
            return receipt_info

        # Extract total (subtotal lines are excluded by the pattern)
        total_match = self._TOTAL_RE.search(text)
        if total_match:
            receipt_info['total'] = self._to_amount(total_match.group(1))

        # Extract date
        date_match = self._DATE_RE.search(text)
        if date_match:
            receipt_info['date'] = date_match.group(1)

        # OCR output frequently starts with blank lines, so the store name is
        # the first line that actually contains text.
        lines = [line.strip() for line in text.splitlines()]
        content_lines = [line for line in lines if line]
        if content_lines:
            receipt_info['store_name'] = content_lines[0]

        # Extract items (simple parsing): every later "<name> <amount>" line,
        # except the summary rows (total / subtotal / tax).
        for line in content_lines[1:]:
            item_match = self._ITEM_RE.match(line)
            if not item_match:
                continue
            name = item_match.group(1).strip()
            if self._SUMMARY_NAME_RE.fullmatch(name):
                continue
            receipt_info['items'].append({
                'name': name,
                'price': self._to_amount(item_match.group(2))
            })

        # Extract tax
        tax_match = self._TAX_RE.search(text)
        if tax_match:
            receipt_info['tax'] = self._to_amount(tax_match.group(1))

        return receipt_info

    def train_classification_model(self, receipt_dataset):
        """
        Train a classification model to improve receipt type detection

        The fitted model and scaler are also stored on the instance so that
        save_model() can persist them.

        Args:
            receipt_dataset (list): List of dicts, each with a 'features'
                entry (the feature vector) and a 'label' entry

        Returns:
            tuple: Trained model and scaler
        """
        # Prepare features and labels
        X = [receipt['features'] for receipt in receipt_dataset]
        y = [receipt['label'] for receipt in receipt_dataset]

        # Hold out 20% of the samples for evaluation
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Fit the scaler on the training split only, then apply it to both
        self.scaler = StandardScaler()
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        # Train Random Forest Classifier
        self.model = RandomForestClassifier(n_estimators=100, random_state=42)
        self.model.fit(X_train_scaled, y_train)

        # Evaluate the model on the held-out split
        y_pred = self.model.predict(X_test_scaled)
        print("Model Accuracy:", accuracy_score(y_test, y_pred))
        print("\nClassification Report:\n", classification_report(y_test, y_pred))

        return self.model, self.scaler

    def process_receipt(self, image_path):
        """
        Process a single receipt image

        Args:
            image_path (str): Path to receipt image

        Returns:
            dict: Extracted receipt information
        """
        preprocessed_image = self.preprocess_image(image_path)
        extracted_text = self.extract_text(preprocessed_image)
        return self.parse_receipt_text(extracted_text)

    def process_receipt_batch(self, image_folder, output_csv='receipts_output.csv'):
        """
        Process multiple receipt images in a batch

        Images that fail to process are reported on stdout and skipped.

        Args:
            image_folder (str): Folder containing receipt images
            output_csv (str): Path to output CSV file

        Returns:
            pandas.DataFrame: DataFrame with extracted receipt information
        """
        receipts_data = []

        # os.listdir returns entries in arbitrary order; sort so the rows of
        # the resulting DataFrame/CSV are reproducible between runs.
        for filename in sorted(os.listdir(image_folder)):
            if not filename.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff')):
                continue
            image_path = os.path.join(image_folder, filename)
            try:
                receipt_info = self.process_receipt(image_path)
                receipt_info['filename'] = filename
                receipts_data.append(receipt_info)
            except Exception as e:
                # Best effort: one bad image must not abort the whole batch.
                print(f"Error processing {filename}: {e}")

        # Convert to DataFrame
        df = pd.DataFrame(receipts_data)

        # Save to CSV
        df.to_csv(output_csv, index=False)
        print(f"Receipts data saved to {output_csv}")

        return df

    def save_model(self, model_path='receipt_ocr_model.joblib'):
        """
        Save trained model and scaler

        Args:
            model_path (str): Path to save the model
        """
        # Compare against None explicitly: ensemble estimators define __len__,
        # so their truthiness is not a reliable "has been set" check.
        if self.model is not None and self.scaler is not None:
            joblib.dump({
                'model': self.model,
                'scaler': self.scaler
            }, model_path)
            print(f"Model and scaler saved to {model_path}")
        else:
            print("No model to save. Train the model first.")

In [6]:
# Example usage
def main(image_folder='/content/images'):
    """
    Run the receipt OCR pipeline over a folder of images.

    Args:
        image_folder (str): Folder containing the receipt images to process

    Returns:
        pandas.DataFrame: One row per successfully processed receipt, as
            returned by ReceiptOCRExtractor.process_receipt_batch
    """
    # Initialize the extractor
    extractor = ReceiptOCRExtractor()

    # Process a batch of receipts
    results_df = extractor.process_receipt_batch(image_folder)

    # Optional: Train a classification model if you have a labeled dataset
    # receipt_dataset = [
    #     {'features': [...], 'label': 'grocery'},
    #     {'features': [...], 'label': 'restaurant'}
    # ]
    # model, scaler = extractor.train_classification_model(receipt_dataset)
    # extractor.save_model()

    # Hand the results back so the caller can inspect them instead of
    # re-reading the CSV.
    return results_df

if __name__ == "__main__":
    main()

Receipts data saved to receipts_output.csv
