stbt.ocr: Add corrections parameter

A dict of {bad: good} mappings to correct known OCR mistakes.
stb-tester · Apr 6, 2020 · 041b59d · 041b59d
1 parent 3ec4b18
commit 041b59d
Show file tree

Hide file tree

Showing 3 changed files with 74 additions and 2 deletions.
diff --git a/_stbt/ocr.py b/_stbt/ocr.py
@@ -23,7 +23,8 @@
 from .logging import debug, ImageLogger, warn
 from .types import Region
 from .utils import (
-    named_temporary_directory, native_int, native_str, text_type, to_unicode)
+    basestring, named_temporary_directory, native_int, native_str, text_type,
+    to_unicode)
 
 # Tesseract sometimes has a hard job distinguishing certain glyphs such as
 # ligatures and different forms of the same punctuation.  We strip out this
@@ -150,7 +151,8 @@ def ocr(frame=None, region=Region.ALL,
         mode=OcrMode.PAGE_SEGMENTATION_WITHOUT_OSD,
         lang=None, tesseract_config=None, tesseract_user_words=None,
         tesseract_user_patterns=None, upsample=True, text_color=None,
-        text_color_threshold=None, engine=None, char_whitelist=None):
+        text_color_threshold=None, engine=None, char_whitelist=None,
+        corrections=None):
     r"""Return the text present in the video frame as a Unicode string.
 
     Perform OCR (Optical Character Recognition) using the "Tesseract"
@@ -240,10 +242,26 @@ def ocr(frame=None, region=Region.ALL,
         that tesseract won't think that a zero is the letter o.
         Note that Tesseract 4.0's LSTM engine ignores ``char_whitelist``.
 
+    :param dict corrections:
+        Dictionary of corrections to replace known OCR mis-reads. Each key of
+        the dict is the text to search for; the value is the corrected string
+        to replace the matching key. If the key is a string, it is treated as
+        plain text and it will only match at word boundaries (for example the
+        string ``"he saw"`` won't match ``"the saw"`` nor ``"he saws"``). If
+        the key is a regular expression pattern (created with `re.compile`) it
+        can match anywhere, and the replacement string can contain
+        backreferences such as ``"\1"`` which are replaced with the
+        corresponding group in the pattern (same as Python's `re.sub`).
+        Example::
+
+            corrections={'bad': 'good',
+                         re.compile(r'[oO]'): '0'}
+
     | Added in v28: The ``upsample`` and ``text_color`` parameters.
     | Added in v29: The ``text_color_threshold`` parameter.
     | Added in v30: The ``engine`` parameter and support for Tesseract v4.
     | Added in v31: The ``char_whitelist`` parameter.
+    | Added in v32: The ``corrections`` parameter.
     """
     if frame is None:
         import stbt
@@ -269,6 +287,10 @@ def ocr(frame=None, region=Region.ALL,
         tesseract_user_patterns, tesseract_user_words, upsample, text_color,
         text_color_threshold, engine, char_whitelist, imglog)
     text = text.strip().translate(_ocr_transtab)
+
+    if corrections is not None:
+        text = apply_ocr_corrections(text, corrections)
+
     debug(u"OCR in region %s read '%s'." % (region, text))
     _log_ocr_image_debug(imglog, text)
     return text
@@ -363,6 +385,31 @@ def match_text(text, frame=None, region=Region.ALL,
     return result
 
 
+# Python 2.7 & 3.6 have `re._pattern_type`, but Python 3.7 removed it and
+# introduced `re.Pattern`; `type(re.compile(""))` works on all of them.
+PatternType = type(re.compile(""))
+
+
+def apply_ocr_corrections(text, corrections):
+    """Applies the same corrections as `stbt.ocr`'s ``corrections`` parameter.
+
+    This is also available as a separate function, so that you can use it to
+    post-process old test artifacts using new corrections.
+
+    :param text: The text to correct.
+    :param dict corrections: Mapping of ``{bad: good}``. Keys that are plain
+        strings only match at word boundaries; keys that are compiled regular
+        expressions can match anywhere and are applied with `re.sub`.
+        ``None`` or an empty dict leaves ``text`` unchanged.
+    :returns: The corrected text.
+    """
+    if not corrections:
+        return text
+
+    # Match plain strings at word boundaries. Alternatives in a regex are
+    # tried from left to right and the first one that matches wins, so we put
+    # the longest keys first: otherwise a key like "he" could shadow the key
+    # "he saw", depending on the dict's iteration order.
+    plain_keys = sorted((k for k in corrections if isinstance(k, basestring)),
+                        key=len, reverse=True)
+    pattern = "|".join(r"\b(" + re.escape(k) + r")\b" for k in plain_keys)
+    if pattern:
+        def replace(matchobj):
+            # The matched text is exactly one of the plain-string keys.
+            return corrections[matchobj.group(0)]
+
+        text = re.sub(pattern, replace, text)
+
+    # Match regexes:
+    for k, v in corrections.items():
+        if isinstance(k, PatternType):
+            text = re.sub(k, v, text)
+    return text
+
+
 _memoise_tesseract_version = None
 
 

diff --git a/_stbt/utils.py b/_stbt/utils.py
@@ -96,6 +96,7 @@ def find_import_name(filename):
 
 if sys.version_info.major == 2:  # Python 2
     text_type = unicode  # pylint: disable=undefined-variable
+    basestring = basestring  # pylint: disable=redefined-builtin,undefined-variable
 
     def strip_newtypes(text):
         """python-future's string newtypes can behave in surprising ways.  We
@@ -126,6 +127,7 @@ def check(x, y):
         check(strip_newtypes(newbytes(b"abc")), b"abc")
 else:
     text_type = str
+    basestring = str
 
     def strip_newtypes(text):
         # newtypes won't be used on Python 3

diff --git a/tests/test_ocr.py b/tests/test_ocr.py
@@ -185,6 +185,29 @@ def test_char_whitelist():
         char_whitelist="0123456789")
 
 
+@requires_tesseract
+@pytest.mark.parametrize("corrections,expected", [
+    # pylint:disable=bad-whitespace
+    # Default ocr output:
+    (None,                                           'OO'),
+    # Corrections string must match entire word:
+    ({'O': '0'},                                     'OO'),
+    ({'OO': '00'},                                   '00'),
+    # Strings are case-sensitive, and they aren't regexes:
+    ({'oo': '00', '[oO]': '0'},                      'OO'),
+    # Regexes do match anywhere:
+    ({re.compile('[oO]'): '0'},                      '00'),
+    # Make sure it tries all the patterns:
+    ({'AA': 'BB', 'OO': '00'},                       '00'),
+    ({re.compile('^O'): '1', re.compile('O$'): '2'}, '12'),
+])
+def test_corrections(corrections, expected):
+    """Check `stbt.ocr`'s ``corrections`` parameter against ``ocr/00.png``.
+
+    Without corrections the image is read as 'OO' (the first case); every
+    other case checks what a given ``corrections`` dict turns that into.
+    """
+    f = load_image('ocr/00.png')
+    # Presumably printed so that the corrections under test are visible in the
+    # test output when a case fails.
+    print(corrections)
+    assert expected == stbt.ocr(frame=f, mode=stbt.OcrMode.SINGLE_WORD,
+                                corrections=corrections)
+
+
 @requires_tesseract
 @pytest.mark.parametrize("words", [
     pytest.param(None, marks=pytest.mark.xfail),