In [1]:
from PIL import Image
import time
import numpy as np
import pandas as pd
import sys
import os

## Data

In [2]:
from datasets import load_dataset

# Load the `getomni-ai/ocr-benchmark` dataset through the `datasets` library.
# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("getomni-ai/ocr-benchmark")

# Show the container type that was returned (a DatasetDict, indexed by split name).
print(type(ds))

  from .autonotebook import tqdm as notebook_tqdm


<class 'datasets.dataset_dict.DatasetDict'>


In [3]:
# Work with the 'test' split; later cells index into it to fetch sample images.
data = ds['test']

## Models

In [4]:
# Registry of OCR backends to benchmark. The run loop calls each entry as
# `handler(image)` and unpacks the result as `(output, runtime, setup_time)`.
ocr_models = {} # {model_name: handler_func}

### apple-ocr

The raw Apple OCR output is reformatted for benchmark consistency.
```
Before parsing:
[
  (
    text: str,
    confidence: float,
    [bbox.origin.x: float, bbox.origin.y: float, bbox.size.width: float, bbox.size.height: float]
  ), ...
]
Apple:
bbox: The coordinates of the bounding box are normalized to the dimensions of the processed image, with the origin at the lower-left corner of the image.
confidence: A normalized confidence score for the text recognition result.
```

In [None]:
# Add the local apple-ocr source directory to `sys.path` so `apple_ocr` can be imported.
# The relative path is resolved against the current working directory, and every
# re-run of this cell appends another (duplicate) entry to `sys.path`.
module_path = './models/apple-ocr/src'
sys.path.append(os.path.abspath(module_path))
from apple_ocr import ocr

def apple_ocr_handler(image):
  """Run Apple OCR on an image and return detections in the benchmark format.

  Args:
    image: Image object exposing `.size` as `(width, height)`; it is passed
      unchanged to `ocr.OCR(image=...)`.

  Returns:
    A tuple `(results, runtime, setup_time)` where
      results: list of dicts with keys 'text', 'confidence' and 'bbox'. 'bbox'
        holds four `[x, y]` pixel corners in image coordinates (origin at the
        top-left corner, y growing downwards), ordered top-left, top-right,
        bottom-right, bottom-left.
      runtime: seconds spent inside `recognize()`.
      setup_time: seconds spent constructing `ocr.OCR`.
  """
  def parse_apple_ocr_result(data, w, h):
    """Convert raw Apple detections into a list of benchmark-format dicts.

    Raw format, one tuple per detection:
      (text: str, confidence: float, [origin.x, origin.y, size.width, size.height])
    The box values are normalized to the image dimensions and measured from the
    lower-left corner of the image.
    """
    output = []
    for instance in data:
      origin_x, origin_y, box_w, box_h = instance[2][:4]

      left = origin_x * w
      right = left + box_w * w
      # Apple's y axis starts at the bottom edge and points up; flip it so the
      # corners are expressed in image (top-left origin) pixel coordinates.
      bottom = h - origin_y * h
      top = bottom - box_h * h
      left, top, right, bottom = (
        np.int32(round(v)) for v in (left, top, right, bottom)
      )

      output.append({
        'text': instance[0],
        'confidence': instance[1],
        'bbox': [
          [left, top],      # top left
          [right, top],     # top right
          [right, bottom],  # bottom right
          [left, bottom]    # bottom left
        ]
      })
    return output

  # perf_counter is monotonic and high-resolution, which matters for the very
  # short setup interval measured here.
  setup_start = time.perf_counter()
  apple_ocr = ocr.OCR(image=image)
  setup_time = time.perf_counter() - setup_start

  image_width, image_height = image.size

  run_start = time.perf_counter()
  apple_ocr.recognize()  # return value is not needed; detections are read from `.data`
  runtime = time.perf_counter() - run_start

  return parse_apple_ocr_result(apple_ocr.data, image_width, image_height), runtime, setup_time

# Register the handler under its model name so the benchmark loop runs it.
ocr_models['apple-ocr'] = apple_ocr_handler

### easy-ocr

In [6]:
import easyocr
def easyocr_handler(image):
  """Run EasyOCR on an image and return detections in the benchmark format.

  Args:
    image: Anything `np.array(...)` can convert; the resulting array is what
      gets passed to `Reader.readtext`.

  Returns:
    A tuple `(results, runtime, setup_time)` where
      results: list of dicts with keys 'text', 'confidence' and 'bbox'; 'bbox'
        is the four-point box exactly as EasyOCR reported it.
      runtime: seconds spent inside `readtext`.
      setup_time: seconds spent constructing the English `easyocr.Reader`.
  """
  def parse_easyocr_result(data):
    # Raw format, one tuple per detection:
    #   ([[x1, y1], [x2, y2], [x3, y3], [x4, y4]], text: str, confidence: np.float64)
    return [
      {'text': det[1], 'confidence': det[2], 'bbox': det[0]}
      for det in data
    ]

  pixels = np.array(image)

  # perf_counter is monotonic and high-resolution, so the measured intervals
  # cannot be skewed by wall-clock adjustments.
  setup_start = time.perf_counter()
  reader = easyocr.Reader(['en'])  # specify language
  setup_time = time.perf_counter() - setup_start

  run_start = time.perf_counter()
  detections = reader.readtext(pixels)
  runtime = time.perf_counter() - run_start

  return parse_easyocr_result(detections), runtime, setup_time

# Register the handler under its model name so the benchmark loop runs it.
ocr_models['easy-ocr'] = easyocr_handler

### tesseract-ocr

Note: On macOS, the `pytesseract` Python package is only a wrapper around the Tesseract binary, so you also need to run `brew install tesseract`.

In [14]:
import pytesseract
def tesseract_ocr_handler(image):
  """Return the plain text that Tesseract extracts from `image`.

  The image is handed straight to `pytesseract.image_to_string`. Unlike the
  other handlers in this notebook, only the recognized string is returned:
  there are no per-detection dicts and no timing values.
  """
  return pytesseract.image_to_string(image)

In [15]:
# Quick manual check of the Tesseract handler on one sample (index 99 of the test split).
img = data[99]['image']
tesseract_ocr_handler(img)

'Staff Shift Schedule\n\nFort Bradly Medical Center -\nWeek of September 27, 2025\n\nEmployee ID\nCourtney Lebsack DZF5Ro\nLinda Lesch CUE1PU\nRoberto Stehr ZPNPLZ\nHorace Gleichner XIQ2UM\nStella Fadel 5PP4T6\nJared Leannon II 9GK3TW\nDonnie Simonis II IMPSCT\nMindy Kunze F7UIXW\nMs. Terri Ziemann VGNNYO\nTabitha Haag GJCWYL\nTina Borer LWO4MG\nDarin Hyatt PA1WIL\nBrittany Goyette ZZRYIL\nVictor Beatty QGWTXC\nJeremy Labadie CFFSHV\nMs. Gina Bahringer H4RTIU\nDwayne Runolfsson MF5DOB\nLola Mills ESIY7C\nBrett Considine YLZBTD\nSean Boyle YSAYPJ\n\nDepartment\n\nSurgery\n\nICU\n\nICU\n\nEmergency\n\nPediatrics\n\nRadiology\n\nNeurology\n\nRadiology\n\nRadiology\n\nCardiology\n\nCardiology\n\nEmergency\n\nRadiology\n\nSurgery\n\nCardiology\n\nEmergency\n\nNeurology\n\nEmergency\n\nOncology\n\nOncology\n\nSat 27 Sun 28\n. Afternoon.\nLeave Sick Leave\n15:00-23:00\nAfternoon\n15:00-23:00\nAfternoon Morning\n15:00-23:00 07:00-15:00\nAfternoon\nLeave Personal\n15:00-23:00\nMorning\n07:00-15

## Run

Note: this is run with a single image; different models may perform differently with batched processing.

In [16]:
# Benchmark every registered handler on one sample image and preview its output.
img = data[99]['image']
for name, handler in ocr_models.items():
  print(f'=========== {name} ===========')
  out, runtime, setup_time = handler(img)
  # Naming the columns up front keeps 'text' available even when a handler
  # returns no detections; an empty list would otherwise give a column-less
  # frame and `df['text']` would raise KeyError.
  df = pd.DataFrame(out, columns=['text', 'confidence', 'bbox'])
  print(f'Setup Time: {setup_time}\nRuntime: {runtime}')
  print("---- Output ----")
  print(df['text'].head(10))
  print()

Setup Time: 3.314018249511719e-05
Runtime: 0.6097888946533203
---- Output ----
0            Staff Shift Schedule
1    Fort Bradlv Medical Center -
2      Week of September 27, 2025
3                        Employee
4                Courtney Lebsack
5                     Linda Lesch
6                   Roberto Stehr
7                Horace Gleichner
8                    Stella Fadel
9                Jared Leannon II
Name: text, dtype: object

Setup Time: 2.3394429683685303
Runtime: 6.487469911575317
---- Output ----
0          Staff Shift Schedule
1                          Fort
2         Bradly Medical Center
3    Week of September 27, 2025
4                      Employee
5                            ID
6                    Department
7                        Sat 27
8                        Sun 28
9                        Mon 29
Name: text, dtype: object

