
<font size="10">**Extracting Date and Total from Receipts Using pytesseract and easy_ocr**</font>


<font size="6">
The images are first imported, then for both pytesseract and easy_ocr, the following steps are followed:
</font>

<font size="6">
   
1. Obtain all the text from the images 
1. Extract the date using regex
1. Extract the total price using regex
1. Create a pandas dataframe with the image name, raw text extracted, date and total
</font>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
from PIL import Image
import pytesseract
import matplotlib.pyplot as plt
import glob
import os 
import re

print(os.getcwd())

def Display_Image(image, title = 'an image'):
    plt.imshow(image)
    plt.title(title)
    plt.show()
    plt.clf()


In [None]:

'''
Import the data
'''

print(os.getcwd())
list_images = glob.glob(r"/kaggle/input/ocr-receipts-text-detection/images/*.jpg")
print(len(list_images))

images = np.array([np.array(Image.open(fname))\
                   for fname in list_images], dtype = "object")
print(images.shape)
print(images[0].shape)

# Let's display a handfull of images
for i in range(len(images)):
    if i%3 == 0:
        Display_Image(images[i], title = 'image #' + str(i))
# As we can see, some images are of better quality than others

In [None]:
'''
Text extraction with pytesseract and data processing, Part I
'''
# Feed each image into pytesseract to extract text,
# then extract the date and the total amount of each receipt

# Simple test
print(pytesseract.image_to_string(images[3], lang = 'eng'))
# looking good!

# Full run, takes a few minutes
raw_text = [pytesseract.image_to_string(images[i], lang = 'eng') for i in range(len(images))]



In [None]:
[print(len(raw_text[i])) for i in range(len(raw_text))]
# Not a lot has been extracted from the 2nd and 3rd images, 
# Let's have a closer look

Display_Image(images[1], title = 'image #1')
Display_Image(images[2], title = 'image #2')
# That is fair that the algorithum is struggling, those are saturated and blurry pictures

In [None]:
'''
Text extraction with pytesseract and data processing, Part II
'''
# Extract the date and total from text pytesseract extracted from the images

# Let's extract the dates format from raw text
# Trader joe's date format: mm-dd-yyyy
# Walmart date format: mm/dd/yy
# WholeFood: mm/dd/yyyy
# Spar: dd.mm.yy
# WinCo Food: mm/dd/yy
# Costco: mm/dd/yyyy
# MOMI & TOY's: dd/mm/yyyy
#
# a few format stands out:
# number: #
#   ##-##-#### regex: \d\d+-+\d\d+-+\d\d\d\d
#   ##/##/#### regex: 
#   ##.##.##
#   ##/##/##
#
# regex: \d\d+[-/.]+\d\d+[-/.]+\d\d\d\d
# regex: \d\d+[-/.]+\d\d+[-/.]+\d\d\D
# Those 2 functions should conver all the format above!
#
# I ended up creating a regex function that combines the 2 above,
# Final regex: \d{2}[.\/-]\d{2}[.\/-]\d{2,4}

# Function to search for dates within the raw_text
#
def Date_Extraction(text):
    temp = re.findall("\d{2}[.\/-]\d{2}[.\/-]\d{2,4}", text)
    temp = pd.Series(temp)
    temp = list(temp.unique())
    
    return temp
    

Date_Extraction(raw_text[0]) 

date = [Date_Extraction(text) for text in raw_text]
print(date)


# Function to search for total price 
#

def Total_Extraction(text):
    try:
        test = pd.Series(re.split("\n",text ))
        value = test[test.str.find('TOTAL')==0].reset_index(drop = True)    
        num = float(re.sub("[^\d.]", "", value[0]))
        return num
    except:
        return None

total = [Total_Extraction(text) for text in raw_text]

print(total)

In [None]:
'''
Text extraction with pytesseract and data processing Part III
'''
# Create a data frame with all the information gathered

# Get the image's names
images_name = [list_images[i].split('/')[-1] for i in range(len(images))]

data_extraction_pytesseract = pd.DataFrame( {
                                'image': images_name
                                ,'raw_text': raw_text
                                ,'date': date
                                ,'total': total
                                })
data_extraction_pytesseract

In [None]:
'''
Text extraction with easy_ocr and data processing, Part I
'''

import easyocr

reader = easyocr.Reader(['en'], gpu = False)

Display_Image(images[0])

In [None]:
'''
Text extraction with easy_ocr and data processing, Part II
'''
text = reader.readtext(images[0])
print(text)
# The format is quite different that with pytesseract!

all_text = [reader.readtext(images[im]) for im in range(len(images))]

In [None]:
'''
Text extraction with easy_ocr and data processing, Part III
'''
def Easyocr_Total_Extraction(text):
    # Isolate teh text, exclude the box and the accuracy values
    sub_text = pd.DataFrame(text)[1]
    
    # Turn everything into a single string
    sub_text = ' '.join(sub_text)
    sub_text = sub_text.upper()

    # Find the find the set of number and some characters following teh word 'TOTAL'
    # while excluding 'SUBTOTAL'
    amount = re.findall(r"((?<=TOTAL\s)[$,.0-9]+)", sub_text)
    
    try:
        # get rid of the dollard sign and ouput a float
        amount = re.findall(r"([0-9.,]+)", amount[0])
        return float(amount[0])
    except:
        return None

print(len(all_text[0]))


totals = [Easyocr_Total_Extraction(all_text[im]) for im in range(len(all_text))]
print(totals)

In [None]:
'''
Text extraction with easy_ocr and data processing, Part IV
'''
# Let's extract the date

def Easyocr_Date_Extraction(text):
    sub_text = pd.DataFrame(text)[1]

    sub_text = ' '.join(sub_text)
    sub_text = sub_text.upper()

    date = re.findall(r"(\d{2}[.\/-]\d{2}[.\/-]\d{2,4})", sub_text)
    
    try:
        return date[0]
    except:
        return None

dates = [(Easyocr_Date_Extraction(all_text[im])) for im in range(len(all_text))]
print(dates)

In [None]:
'''
Text extraction with easy_ocr and data processing, Part V
'''
data_extraction_easyocr = pd.DataFrame( {
                                'image': images_name
                                ,'raw_text': all_text
                                ,'date': dates
                                ,'total': totals
                                })
data_extraction_easyocr

MY CODE


In [1]:
!pip install easyocr
!apt install tesseract-ocr


Reading package lists... Done
Building dependency tree       
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2build2).
0 upgraded, 0 newly installed, 0 to remove and 54 not upgraded.


In [2]:
import numpy as np
import pandas as pd
from PIL import Image, ImageEnhance
import pytesseract
import easyocr
import matplotlib.pyplot as plt
import glob
import os 
import re
import cv2
import json
from scipy import ndimage


In [3]:
def preprocess_image(image):
    if isinstance(image, Image.Image):
        image = np.array(image)
    
    if len(image.shape) == 3:
        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    else:
        gray = image.copy()
    
    gray_corrected = correct_skew(gray)
    enhanced = cv2.equalizeHist(gray_corrected)
    denoised = cv2.medianBlur(enhanced, 3)
    kernel = np.ones((1,1), np.uint8)
    cleaned = cv2.morphologyEx(denoised, cv2.MORPH_CLOSE, kernel)
    
    return cleaned

def correct_skew(image):
    try:
        edges = cv2.Canny(image, 50, 150, apertureSize=3)
        lines = cv2.HoughLines(edges, 1, np.pi/180, threshold=100)
        
        if lines is not None:
            angles = []
            for rho, theta in lines[:10]:
                angle = theta * 180 / np.pi
                if angle > 45:
                    angle = angle - 90
                angles.append(angle)
            
            median_angle = np.median(angles)
            if abs(median_angle) > 0.5:
                rotated = ndimage.rotate(image, median_angle, reshape=False, cval=255)
                return rotated
        
        return image
    except:
        return image


In [7]:
def detect_printed_area(image):
    try:
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
        else:
            gray = image.copy()
        
        _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        
        if contours:
            largest_contour = max(contours, key=cv2.contourArea)
            x, y, w, h = cv2.boundingRect(largest_contour)
            padding = 20
            x = max(0, x - padding)
            y = max(0, y - padding)
            w = min(gray.shape[1] - x, w + 2 * padding)
            h = min(gray.shape[0] - y, h + 2 * padding)
            cropped = image[y:y+h, x:x+w]
            return cropped
        
        return image
    except:
        return image


In [8]:
def extract_merchant_name_pytesseract(text):
    lines = text.strip().split('\n')
    merchant_keywords = [
        'TRADER JOE', 'WALMART', 'WHOLE FOODS', 'COSTCO', 'SAFEWAY', 'KROGER',
        'TARGET', 'CVS', 'WALGREENS', 'MCDONALD', 'STARBUCKS', 'SUBWAY',
        'SPAR', 'WINCO', 'MOMI', 'TOY', 'STORE', 'MARKET', 'SHOP'
    ]
    for i, line in enumerate(lines[:5]):
        line_upper = line.upper().strip()
        if len(line_upper) < 3 or re.search(r'^\d+[\d\s\-/]*$', line_upper):
            continue
        for keyword in merchant_keywords:
            if keyword in line_upper:
                cleaned = re.sub(r'[^\w\s&\'-]', ' ', line_upper)
                return ' '.join(cleaned.split())
        if i == 0 and len(line_upper) > 3:
            cleaned = re.sub(r'[^\w\s&\'-]', ' ', line_upper)
            return ' '.join(cleaned.split())
    return None

def extract_merchant_name_easyocr(text_data):
    if not text_data:
        return None
    texts = [item[1] for item in text_data]
    merchant_keywords = [
        'TRADER JOE', 'WALMART', 'WHOLE FOODS', 'COSTCO', 'SAFEWAY', 'KROGER',
        'TARGET', 'CVS', 'WALGREENS', 'MCDONALD', 'STARBUCKS', 'SUBWAY',
        'SPAR', 'WINCO', 'MOMI', 'TOY', 'STORE', 'MARKET', 'SHOP'
    ]
    for i, text in enumerate(texts[:5]):
        text_upper = text.upper().strip()
        if len(text_upper) < 3 or re.search(r'^\d+[\d\s\-/]*$', text_upper):
            continue
        for keyword in merchant_keywords:
            if keyword in text_upper:
                cleaned = re.sub(r'[^\w\s&\'-]', ' ', text_upper)
                return ' '.join(cleaned.split())
        if i == 0 and len(text_upper) > 3:
            cleaned = re.sub(r'[^\w\s&\'-]', ' ', text_upper)
            return ' '.join(cleaned.split())
    return None


In [9]:
def extract_total_pytesseract(text):
    try:
        lines = text.split('\n')
        for line in lines:
            line_upper = line.upper().strip()
            if line_upper.startswith('TOTAL'):
                numbers = re.findall(r'\d+\.?\d*', line)
                if numbers:
                    amounts = [float(num) for num in numbers if '.' in num or len(num) >= 2]
                    if amounts:
                        return max(amounts)
        text_upper = text.upper()
        total_matches = re.findall(r'TOTAL[^\d]*(\d+\.?\d{2})', text_upper)
        if total_matches:
            return float(total_matches[-1])
        total_pattern = re.search(r'TOTAL[^\d]*\$?(\d+\.\d{2})', text_upper)
        if total_pattern:
            return float(total_pattern.group(1))
        all_amounts = re.findall(r'\$?(\d+\.\d{2})', text)
        if all_amounts:
            return max([float(a) for a in all_amounts])
        return None
    except:
        return None

def extract_total_easyocr(text_data):
    try:
        if not text_data:
            return None
        texts = [item[1] for item in text_data]
        combined_text = ' '.join(texts).upper()
        total_pattern = re.search(r'TOTAL[^\d]*\$?(\d+\.?\d{2})', combined_text)
        if total_pattern:
            return float(total_pattern.group(1))
        for text in texts:
            text_upper = text.upper().strip()
            if 'TOTAL' in text_upper:
                numbers = re.findall(r'\d+\.\d{2}', text_upper)
                if numbers:
                    return float(numbers[-1])
        all_amounts = re.findall(r'\$?(\d+\.\d{2})', combined_text)
        if all_amounts:
            return max([float(a) for a in all_amounts])
        return None
    except:
        return None


In [12]:
def process_receipts_pipeline():
    list_images = glob.glob(r"/kaggle/input/ocr-receipts-text-detection/images/*.jpg")
    print(f"Found {len(list_images)} images to process")
    
    reader = easyocr.Reader(['en'], gpu=False)
    results = []

    for i, image_path in enumerate(list_images):
        print(f"Processing image {i+1}/{len(list_images)}: {os.path.basename(image_path)}")
        try:
            image = np.array(Image.open(image_path))
            cropped_image = detect_printed_area(image)
            processed_image = preprocess_image(cropped_image)

            pytess_text = pytesseract.image_to_string(processed_image, lang='eng')
            pytess_merchant = extract_merchant_name_pytesseract(pytess_text)
            pytess_total = extract_total_pytesseract(pytess_text)

            easyocr_result = reader.readtext(processed_image)
            easyocr_merchant = extract_merchant_name_easyocr(easyocr_result)
            easyocr_total = extract_total_easyocr(easyocr_result)

            if pytess_merchant and pytess_total:
                merchant_name = pytess_merchant
                total_amount = pytess_total
                ocr_engine = "pytesseract"
            elif easyocr_merchant and easyocr_total:
                merchant_name = easyocr_merchant
                total_amount = easyocr_total
                ocr_engine = "easyocr"
            else:
                merchant_name = pytess_merchant or easyocr_merchant
                total_amount = pytess_total or easyocr_total
                ocr_engine = "hybrid"

            results.append({
                "image": os.path.basename(image_path),
                "merchant_name": merchant_name,
                "total_amount": total_amount,
                "ocr_engine_used": ocr_engine
            })

            print(f"  Merchant: {merchant_name}")
            print(f"  Total: ${total_amount}")
            print()
        except Exception as e:
            print(f"Error processing {image_path}: {str(e)}")
            results.append({
                "image": os.path.basename(image_path),
                "merchant_name": None,
                "total_amount": None,
                "ocr_engine_used": "error"
            })
    
    return results


In [13]:
def save_results_to_json(results, filename="ocr_results.json"):
    json_results = [{
        "image": r["image"],
        "merchant_name": r["merchant_name"],
        "total_amount": r["total_amount"]
    } for r in results]
    
    with open(filename, 'w') as f:
        json.dump(json_results, f, indent=2)
    
    print(f"Results saved to {filename}")
    return json_results


In [14]:
results = process_receipts_pipeline()
json_results = save_results_to_json(results)

df = pd.DataFrame(results)
print("\nResults Summary:")
print(df[['image', 'merchant_name', 'total_amount', 'ocr_engine_used']])

successful_merchant = df['merchant_name'].notna().sum()
successful_total = df['total_amount'].notna().sum()
total_images = len(df)

print(f"\nSuccess Rates:")
print(f"Merchant Name: {successful_merchant}/{total_images} ({successful_merchant/total_images*100:.1f}%)")
print(f"Total Amount: {successful_total}/{total_images} ({successful_total/total_images*100:.1f}%)")
print(f"Both Fields: {df.dropna(subset=['merchant_name', 'total_amount']).shape[0]}/{total_images}")


Found 19 images to process
Processing image 1/19: 5.jpg
  Merchant: WHOLE FOODS
  Total: $28.28

Processing image 2/19: 8.jpg
  Merchant: None
  Total: $None

Processing image 3/19: 10.jpg
  Merchant: BEER 036-4481240
  Total: $15.99

Processing image 4/19: 0.jpg
  Merchant: WAL MART '
  Total: $8348.64

Processing image 5/19: 9.jpg
  Merchant: WINCO
  Total: $121.92

Processing image 6/19: 1.jpg
  Merchant: 2001 GREENYI1IE AVE
  Total: $200.69

Processing image 7/19: 16.jpg
  Merchant: EFT DEBIT PAY FROM PRIMARY
  Total: $2696.0

Processing image 8/19: 7.jpg
  Merchant: EFT DEBIT PAY FROM PRIMARY
  Total: $2696.0

Processing image 9/19: 13.jpg
  Merchant: ID A 1SWOVEXCW
  Total: $10.0

Processing image 10/19: 17.jpg
  Merchant: WP RSA
  Total: $None

Processing image 11/19: 15.jpg
  Merchant: WALMART
  Total: $70.63

Processing image 12/19: 12.jpg
  Merchant: WALAMART
  Total: $13.48

Processing image 13/19: 11.jpg
  Merchant: WHOLE
  Total: $2.15

Processing image 14/19: 4.jpg
  Merc