Skip to content

Commit

Permalink
stbt.ocr: Add corrections parameter
Browse files Browse the repository at this point in the history
A dict of {bad: good} mappings to correct known OCR mistakes.
  • Loading branch information
drothlis committed Apr 6, 2020
1 parent 3ec4b18 commit 041b59d
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 2 deletions.
51 changes: 49 additions & 2 deletions _stbt/ocr.py
Expand Up @@ -23,7 +23,8 @@
from .logging import debug, ImageLogger, warn
from .types import Region
from .utils import (
named_temporary_directory, native_int, native_str, text_type, to_unicode)
basestring, named_temporary_directory, native_int, native_str, text_type,
to_unicode)

# Tesseract sometimes has a hard job distinguishing certain glyphs such as
# ligatures and different forms of the same punctuation. We strip out this
Expand Down Expand Up @@ -150,7 +151,8 @@ def ocr(frame=None, region=Region.ALL,
mode=OcrMode.PAGE_SEGMENTATION_WITHOUT_OSD,
lang=None, tesseract_config=None, tesseract_user_words=None,
tesseract_user_patterns=None, upsample=True, text_color=None,
text_color_threshold=None, engine=None, char_whitelist=None):
text_color_threshold=None, engine=None, char_whitelist=None,
corrections=None):
r"""Return the text present in the video frame as a Unicode string.
Perform OCR (Optical Character Recognition) using the "Tesseract"
Expand Down Expand Up @@ -240,10 +242,26 @@ def ocr(frame=None, region=Region.ALL,
that tesseract won't think that a zero is the letter o.
Note that Tesseract 4.0's LSTM engine ignores ``char_whitelist``.
:param dict corrections:
Dictionary of corrections to replace known OCR mis-reads. Each key of
the dict is the text to search for; the value is the corrected string
to replace the matching key. If the key is a string, it is treated as
plain text and it will only match at word boundaries (for example the
string ``"he saw"`` won't match ``"the saw"`` nor ``"he saws"``). If
the key is a regular expression pattern (created with `re.compile`) it
can match anywhere, and the replacement string can contain
backreferences such as ``"\1"`` which are replaced with the
corresponding group in the pattern (same as Python's `re.sub`).
Example::
corrections={'bad': 'good',
re.compile(r'[oO]'): '0'}
| Added in v28: The ``upsample`` and ``text_color`` parameters.
| Added in v29: The ``text_color_threshold`` parameter.
| Added in v30: The ``engine`` parameter and support for Tesseract v4.
| Added in v31: The ``char_whitelist`` parameter.
| Added in v32: The ``corrections`` parameter.
"""
if frame is None:
import stbt
Expand All @@ -269,6 +287,10 @@ def ocr(frame=None, region=Region.ALL,
tesseract_user_patterns, tesseract_user_words, upsample, text_color,
text_color_threshold, engine, char_whitelist, imglog)
text = text.strip().translate(_ocr_transtab)

if corrections is not None:
text = apply_ocr_corrections(text, corrections)

debug(u"OCR in region %s read '%s'." % (region, text))
_log_ocr_image_debug(imglog, text)
return text
Expand Down Expand Up @@ -363,6 +385,31 @@ def match_text(text, frame=None, region=Region.ALL,
return result


# The class of compiled regular expressions has no stable public name across
# the Python versions we support: Python 2.7 & 3.6 have `re._pattern_type`,
# which is removed in Python 3.7 where `re.Pattern` is introduced. So we take
# the class from an actual compiled pattern instead.
PatternType = re.compile("").__class__


def apply_ocr_corrections(text, corrections):
    """Apply the same corrections as `stbt.ocr`'s ``corrections`` parameter.

    This is also available as a separate function, so that you can use it to
    post-process old test artifacts using new corrections.

    :param text: The text to correct (for example the output of `stbt.ocr`).

    :param dict corrections:
        Mapping of ``{bad: good}``. A string key is matched literally
        (case-sensitive, not as a regex) and only at word boundaries. A key
        that is a compiled regular expression (see `re.compile`) can match
        anywhere, and its replacement may contain backreferences such as
        ``"\\1"`` (same as `re.sub`). Keys of any other type are ignored.
        ``None`` or an empty dict leaves ``text`` unchanged.

    :returns: The corrected text.

    All string corrections are applied first, in a single pass over the
    text; when more than one string key matches at the same position the
    longest key wins, so the result doesn't depend on the order of the dict.
    The regex corrections are applied afterwards, one after another, in the
    dict's iteration order.
    """
    if not corrections:
        return text

    # Match plain strings at word boundaries. Regex alternation takes the
    # first alternative that matches, so put the longest keys first:
    # otherwise "he" could shadow "he saw" depending on dict ordering.
    literals = sorted((k for k in corrections if isinstance(k, basestring)),
                      key=len, reverse=True)
    if literals:
        pattern = "|".join(r"\b(" + re.escape(k) + r")\b" for k in literals)
        # Keys are escaped, so the matched text is always exactly one of the
        # keys and can be looked up directly.
        text = re.sub(pattern,
                      lambda matchobj: corrections[matchobj.group(0)],
                      text)

    # Match regexes:
    for k, v in corrections.items():
        if isinstance(k, PatternType):
            text = re.sub(k, v, text)
    return text


_memoise_tesseract_version = None


Expand Down
2 changes: 2 additions & 0 deletions _stbt/utils.py
Expand Up @@ -96,6 +96,7 @@ def find_import_name(filename):

if sys.version_info.major == 2: # Python 2
text_type = unicode # pylint: disable=undefined-variable
basestring = basestring # pylint: disable=redefined-builtin,undefined-variable

def strip_newtypes(text):
"""python-future's string newtypes can behave in surprising ways. We
Expand Down Expand Up @@ -126,6 +127,7 @@ def check(x, y):
check(strip_newtypes(newbytes(b"abc")), b"abc")
else:
text_type = str
basestring = str

def strip_newtypes(text):
# newtypes won't be used on Python 3
Expand Down
23 changes: 23 additions & 0 deletions tests/test_ocr.py
Expand Up @@ -185,6 +185,29 @@ def test_char_whitelist():
char_whitelist="0123456789")


@requires_tesseract
@pytest.mark.parametrize(("corrections", "expected"), [
    # pylint:disable=bad-whitespace
    # Default ocr output:
    (None, 'OO'),
    # Corrections string must match entire word:
    ({'O': '0'}, 'OO'),
    ({'OO': '00'}, '00'),
    # Strings are case-sensitive, and they aren't regexes:
    ({'oo': '00', '[oO]': '0'}, 'OO'),
    # Regexes do match anywhere:
    ({re.compile('[oO]'): '0'}, '00'),
    # Make sure it tries all the patterns:
    ({'AA': 'BB', 'OO': '00'}, '00'),
    ({re.compile('^O'): '1', re.compile('O$'): '2'}, '12'),
])
def test_corrections(corrections, expected):
    """Check `stbt.ocr`'s ``corrections`` parameter against ``ocr/00.png``.

    Each case pairs a ``corrections`` dict (or None) with the text that
    `stbt.ocr` is expected to return for that image in single-word mode.
    """
    image = load_image('ocr/00.png')
    # Printed so that the parameters show up in the output of a failing run.
    print(corrections)
    actual = stbt.ocr(frame=image,
                      mode=stbt.OcrMode.SINGLE_WORD,
                      corrections=corrections)
    assert expected == actual


@requires_tesseract
@pytest.mark.parametrize("words", [
pytest.param(None, marks=pytest.mark.xfail),
Expand Down

0 comments on commit 041b59d

Please sign in to comment.