In [50]:
from PIL import Image
import time
import numpy as np
import pandas as pd
import sys
import os

## Data

In [20]:
from datasets import load_dataset

# Fetch the OCR benchmark dataset by its hub identifier.
# Login using e.g. `huggingface-cli login` to access this dataset.
ds = load_dataset("getomni-ai/ocr-benchmark")

# Show what kind of container load_dataset returned.
print(type(ds))

<class 'datasets.dataset_dict.DatasetDict'>


In [21]:
# Work with the 'test' split of the loaded dataset; the run cell indexes into it.
data = ds['test']

## Models

In [None]:
# Registry of OCR backends, filled in by the model cells below.
# Each handler is called with one image and returns a (results, runtime) tuple.
ocr_models = {} # {model_name: handler_func}

### apple-ocr

In [52]:
# Make the apple-ocr sources under ./models importable.
module_path = './models/apple-ocr/src'
# Guard the append so re-running this cell does not keep growing sys.path
# with duplicate entries.
if os.path.abspath(module_path) not in sys.path:
  sys.path.append(os.path.abspath(module_path))
from apple_ocr import ocr

def apple_ocr_handler(image):
  """Run Apple OCR on ``image`` and return its detections in the shared result schema.

  Args:
    image: Image object exposing ``size`` as ``(width, height)``; it is handed
      unchanged to ``ocr.OCR``.

  Returns:
    tuple: ``(results, runtime)``. ``results`` is a list of dicts with the keys
    ``'text'``, ``'confidence'`` and ``'bbox'``; ``runtime`` is the duration of
    the ``recognize()`` call in seconds. ``'bbox'`` holds four ``[x, y]`` pixel
    corners in image coordinates (origin at the top-left, y growing downwards),
    ordered top-left, top-right, bottom-right, bottom-left, which is the corner
    layout EasyOCR uses for its boxes.
  """
  def parse_apple_ocr_result(data, w, h):
    """Convert raw Apple OCR tuples into pixel-space result dicts.

    ``data`` is a sequence of
    ``(text, confidence, [origin.x, origin.y, size.width, size.height])``.
    Per Apple, the box is normalized to the dimensions of the processed image
    with the origin at the lower-left corner, and the confidence is a
    normalized score for the recognition result.
    """
    output = []
    for instance in data:
      text, confidence, bbox = instance[0], instance[1], instance[2]
      origin_x, origin_y = bbox[0], bbox[1]
      box_width, box_height = bbox[2], bbox[3]

      left = origin_x * w
      right = left + (box_width * w)
      # Apple measures y upwards from the bottom edge of the image. Flip it so
      # the corners address pixels the way image arrays do (y grows downwards);
      # without the flip every box is mirrored vertically.
      bottom = h - (origin_y * h)
      top = bottom - (box_height * h)
      left, right = np.int32(round(left)), np.int32(round(right))
      top, bottom = np.int32(round(top)), np.int32(round(bottom))

      output.append({
        'text': text,
        'confidence': confidence,
        'bbox': [
            [left, top],     # top left
            [right, top],    # top right
            [right, bottom], # bottom right
            [left, bottom]   # bottom left
          ]
      })
    return output

  apple_ocr = ocr.OCR(image=image)
  image_width, image_height = image.size

  # Time only the recognition call; perf_counter is monotonic, so the measured
  # duration cannot be skewed by system clock adjustments.
  start_time = time.perf_counter()
  apple_ocr.recognize()
  runtime = time.perf_counter() - start_time

  # The raw tuples are read from the OCR object's `data` attribute after
  # recognize() has run; the value returned by recognize() itself is not needed.
  out = apple_ocr.data
  return parse_apple_ocr_result(out, image_width, image_height), runtime

# Register the handler; the key is the label printed for this model by the run loop.
ocr_models['apple-ocr'] = apple_ocr_handler

### Easy-OCR

In [59]:
import easyocr
def easyocr_handler(image):
  """Run EasyOCR on ``image`` and return its detections in the shared result schema.

  Args:
    image: Anything ``np.array`` can turn into an image array (e.g. a PIL image).

  Returns:
    tuple: ``(results, runtime)``. ``results`` is a list of dicts with the keys
    ``'text'``, ``'confidence'`` and ``'bbox'``; ``runtime`` is the duration of
    the ``readtext()`` call in seconds.
  """
  def parse_easyocr_result(data):
    """Re-key raw EasyOCR tuples into result dicts.

    ``data`` is a sequence of
    ``([[x1, y1], [x2, y2], [x3, y3], [x4, y4]], text: str, confidence: np.float64)``.
    """
    return [
      {
        'text': instance[1],
        'confidence': instance[2],
        'bbox': instance[0]
      }
      for instance in data
    ]

  image = np.array(image)

  # Building a Reader is expensive, so create it once and reuse it
  # for every later call instead of rebuilding it per image.
  reader = getattr(easyocr_handler, '_reader', None)
  if reader is None:
    reader = easyocr.Reader(['en'])  # specify language
    easyocr_handler._reader = reader

  # Time only the recognition call; perf_counter is monotonic, so the measured
  # duration cannot be skewed by system clock adjustments.
  start_time = time.perf_counter()
  results = reader.readtext(image)
  runtime = time.perf_counter() - start_time
  return parse_easyocr_result(results), runtime

# Register the handler; the key is the label printed for this model by the run loop.
ocr_models['easy-ocr'] = easyocr_handler

## Run

In [60]:
# Run every registered OCR model on one benchmark sample and show the first
# recognized strings side by side with the time each model needed.
img = data[99]['image']
for name, handler in ocr_models.items():
  out, runtime = handler(img)
  # Name the columns explicitly: when a handler detects no text, `out` is an
  # empty list and a bare DataFrame(out) would have no 'text' column, making
  # the lookup below raise KeyError.
  df = pd.DataFrame(out, columns=['text', 'confidence', 'bbox'])
  print(f'=========== {name} ===========\nRuntime: {runtime}')
  print(df['text'].head(10))

Runtime: 0.6084878444671631
0            Staff Shift Schedule
1    Fort Bradlv Medical Center -
2      Week of September 27, 2025
3                        Employee
4                Courtney Lebsack
5                     Linda Lesch
6                   Roberto Stehr
7                Horace Gleichner
8                    Stella Fadel
9                Jared Leannon II
Name: text, dtype: object
Runtime: 4.683197975158691
0          Staff Shift Schedule
1                          Fort
2         Bradly Medical Center
3    Week of September 27, 2025
4                      Employee
5                            ID
6                    Department
7                        Sat 27
8                        Sun 28
9                        Mon 29
Name: text, dtype: object
