diff --git a/_stbt/ocr.py b/_stbt/ocr.py
index 8ef9d7c93..3081f5cb7 100644
--- a/_stbt/ocr.py
+++ b/_stbt/ocr.py
@@ -23,7 +23,8 @@
 from .logging import debug, ImageLogger, warn
 from .types import Region
 from .utils import (
-    named_temporary_directory, native_int, native_str, text_type, to_unicode)
+    basestring, named_temporary_directory, native_int, native_str, text_type,
+    to_unicode)
 
 # Tesseract sometimes has a hard job distinguishing certain glyphs such as
 # ligatures and different forms of the same punctuation. We strip out this
@@ -150,7 +151,8 @@ def ocr(frame=None, region=Region.ALL,
         mode=OcrMode.PAGE_SEGMENTATION_WITHOUT_OSD,
         lang=None, tesseract_config=None, tesseract_user_words=None,
         tesseract_user_patterns=None, upsample=True, text_color=None,
-        text_color_threshold=None, engine=None, char_whitelist=None):
+        text_color_threshold=None, engine=None, char_whitelist=None,
+        corrections=None):
     r"""Return the text present in the video frame as a Unicode string.
 
     Perform OCR (Optical Character Recognition) using the "Tesseract"
@@ -240,10 +242,26 @@ def ocr(frame=None, region=Region.ALL,
         that tesseract won't think that a zero is the letter o.
         Note that Tesseract 4.0's LSTM engine ignores ``char_whitelist``.
 
+    :param dict corrections:
+        Dictionary of corrections to replace known OCR mis-reads. Each key of
+        the dict is the text to search for; the value is the corrected string
+        to replace the matching key. If the key is a string, it is treated as
+        plain text and it will only match at word boundaries (for example the
+        string ``"he saw"`` won't match ``"the saw"`` nor ``"he saws"``). If
+        the key is a regular expression pattern (created with `re.compile`) it
+        can match anywhere, and the replacement string can contain
+        backreferences such as ``"\1"`` which are replaced with the
+        corresponding group in the pattern (same as Python's `re.sub`).
+        Example::
+
+            corrections={'bad': 'good',
+                         re.compile(r'[oO]'): '0'}
+
     | Added in v28: The ``upsample`` and ``text_color`` parameters.
     | Added in v29: The ``text_color_threshold`` parameter.
     | Added in v30: The ``engine`` parameter and support for Tesseract v4.
     | Added in v31: The ``char_whitelist`` parameter.
+    | Added in v32: The ``corrections`` parameter.
     """
     if frame is None:
         import stbt
@@ -269,6 +287,10 @@ def ocr(frame=None, region=Region.ALL,
         tesseract_user_patterns, tesseract_user_words, upsample, text_color,
         text_color_threshold, engine, char_whitelist, imglog)
     text = text.strip().translate(_ocr_transtab)
+
+    if corrections is not None:
+        text = apply_ocr_corrections(text, corrections)
+
     debug(u"OCR in region %s read '%s'." % (region, text))
     _log_ocr_image_debug(imglog, text)
     return text
@@ -363,6 +385,44 @@ def match_text(text, frame=None, region=Region.ALL,
     return result
 
 
+# Python 2.7 & 3.6 have `re._pattern_type` but that will be removed in Python
+# 3.7 where they introduce `re.Pattern`.
+PatternType = type(re.compile(""))
+
+
+def apply_ocr_corrections(text, corrections):
+    """Applies the same corrections as `stbt.ocr`'s ``corrections`` parameter.
+
+    This is also available as a separate function, so that you can use it to
+    post-process old test artifacts using new corrections.
+
+    :param text: The OCR output to correct.
+    :param dict corrections: Same format as `stbt.ocr`'s ``corrections``
+        parameter: string keys are matched literally as whole words;
+        compiled regular expression keys can match anywhere. If it is
+        ``None`` or empty, ``text`` is returned unchanged.
+    :returns: The corrected text. String corrections are applied first, in a
+        single pass; then each regular expression correction in turn.
+    """
+    if not corrections:
+        return text
+    # Plain strings must match whole words. ``\b`` only works next to a word
+    # character, so use lookarounds: they also handle keys that begin or end
+    # with punctuation, such as "(c)". Longer keys go first so that "he saw"
+    # wins over "he" whatever the dict's iteration order.
+    words = sorted((k for k in corrections if isinstance(k, basestring)),
+                   key=len, reverse=True)
+    if words:
+        pattern = "|".join(
+            r"(?<!\w)" + re.escape(k) + r"(?!\w)" for k in words)
+        text = re.sub(pattern, lambda m: corrections[m.group(0)], text)
+    # Regexes can match anywhere:
+    for k, v in corrections.items():
+        if isinstance(k, PatternType):
+            text = k.sub(v, text)
+    return text
+
+
 _memoise_tesseract_version = None
 
 
diff --git a/_stbt/utils.py b/_stbt/utils.py
index e785d2a99..36cd19a5a 100644
--- a/_stbt/utils.py
+++ b/_stbt/utils.py
@@ -96,6 +96,7 @@ def find_import_name(filename):
 
 if sys.version_info.major == 2:  # Python 2
     text_type = unicode  # pylint: disable=undefined-variable
+    basestring = basestring  # pylint: disable=redefined-builtin,undefined-variable
 
     def strip_newtypes(text):
         """python-future's string newtypes can behave in surprising ways. We
@@ -126,6 +127,7 @@ def check(x, y):
         check(strip_newtypes(newbytes(b"abc")), b"abc")
 else:
     text_type = str
+    basestring = str
 
     def strip_newtypes(text):
         # newtypes won't be used on Python 3
diff --git a/tests/test_ocr.py b/tests/test_ocr.py
index 27810750c..c8d50b9c2 100644
--- a/tests/test_ocr.py
+++ b/tests/test_ocr.py
@@ -185,6 +185,29 @@ def test_char_whitelist():
         char_whitelist="0123456789")
 
 
+@requires_tesseract
+@pytest.mark.parametrize("corrections,expected", [
+    # pylint:disable=bad-whitespace
+    # Default ocr output:
+    (None, 'OO'),
+    # Corrections string must match entire word:
+    ({'O': '0'}, 'OO'),
+    ({'OO': '00'}, '00'),
+    # Strings are case-sensitive, and they aren't regexes:
+    ({'oo': '00', '[oO]': '0'}, 'OO'),
+    # Regexes do match anywhere:
+    ({re.compile('[oO]'): '0'}, '00'),
+    # Make sure it tries all the patterns:
+    ({'AA': 'BB', 'OO': '00'}, '00'),
+    ({re.compile('^O'): '1', re.compile('O$'): '2'}, '12'),
+])
+def test_corrections(corrections, expected):
+    f = load_image('ocr/00.png')
+    print(corrections)
+    assert expected == stbt.ocr(frame=f, mode=stbt.OcrMode.SINGLE_WORD,
+                                corrections=corrections)
+
+
 @requires_tesseract
 @pytest.mark.parametrize("words", [
     pytest.param(None, marks=pytest.mark.xfail),