# See if bad reconstructions can be improved

In [1]:
import json
from collections import defaultdict, namedtuple
from types import SimpleNamespace

from IPython.display import display, HTML

from digi_leap.pylib import consts
from digi_leap.pylib.db import db
from digi_leap.pylib.label_builder.line_align import char_sub_matrix as subs
from digi_leap.pylib.label_builder.line_align import line_align_py  # noqa
from digi_leap.pylib.ocr import ocr_compare as compare

In [2]:
# Inject CSS so <pre> output is never wrapped: long printed lines stay on a
# single row (presumably so aligned text below lines up column by column).
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [3]:
# Location of the gold standard data under the "sernec" data directory.
GOLD_STD_PATH = consts.DATA_DIR / "sernec" / "gold_std_2022-06-28"

# Run parameters shared by the cells below.
ARGS = SimpleNamespace(
    # Database path handed to compare.select_gold_std() and db.connect().
    database=consts.DATA_DIR / "sernec" / "sernec.sqlite",
    # Name of the gold standard set to load and to filter scores by.
    gold_set="gold_set_2022-06-28",
    # Name of the score set to filter scores by.
    score_set="scores_2022-06-28",
    # Character set used to select the character substitution matrix.
    char_set="default",
)

## Set up the line align utility

In [4]:
# Select the character substitution matrix for the configured character set
# and build the line aligner from it.
matrix = subs.select_char_sub_matrix(char_set=ARGS.char_set)
ALIGN = line_align_py.LineAlign(matrix)

## Get gold standard and scores

In [5]:
# Load the gold standard records for the configured gold set, then index them
# by their "gold_id" value for direct lookup.
GOLD_STD = compare.select_gold_std(ARGS.database, ARGS.gold_set)
GOLD_DICT = {g["gold_id"]: g for g in GOLD_STD}

In [6]:
# Build a scorer from the run parameters and fetch its scores.
# NOTE(review): presumably limited to ARGS.score_set / ARGS.gold_set -- confirm
# in ocr_compare.Scorer.
scorer = compare.Scorer(ARGS)
SCORES = scorer.select_scores()

## Get pipeline total scores

In [7]:
PipelineScore = namedtuple("PipelineScore", "score pipeline")


def scores_by_pipeline(scores, gold_std):
    """Total the scores of every pipeline and rank the pipelines by that total.

    Args:
        scores: Iterable of mappings, each with an "actions" key (the pipeline
            identifier) and a numeric "score" key.
        gold_std: Not used by this function; kept so existing calls still work.

    Returns:
        A list of PipelineScore(score, pipeline) tuples in ascending order of
        total score. Ties are ordered by the length of the pipeline identifier
        and then by the identifier itself.
    """
    totals = defaultdict(int)
    for row in scores:
        totals[row["actions"]] += row["score"]

    # Same ordering as sorting (total, len(actions), actions) tuples.
    ranked = sorted(
        totals.items(),
        key=lambda item: (item[1], len(item[0]), item[0]),
    )

    return [PipelineScore(total, actions) for actions, total in ranked]


# Print one PipelineScore per pipeline, lowest total score first.
summed = scores_by_pipeline(SCORES, GOLD_DICT)
for sum_ in summed:
    print(sum_)

PipelineScore(score=2752, pipeline='[["deskew", "easyocr"], ["deskew", "tesseract"], ["binarize", "easyocr"], ["binarize", "tesseract"], ["post_process"]]')
PipelineScore(score=2889, pipeline='[["deskew", "easyocr"], ["deskew", "tesseract"], ["binarize", "easyocr"], ["binarize", "tesseract"]]')
PipelineScore(score=3322, pipeline='[["deskew", "easyocr"], ["deskew", "tesseract"], ["binarize", "easyocr"], ["binarize", "tesseract"], ["denoise", "easyocr"], ["denoise", "tesseract"], ["post_process"]]')
PipelineScore(score=3340, pipeline='[["deskew", "tesseract"], ["binarize", "tesseract"], ["denoise", "tesseract"]]')
PipelineScore(score=3410, pipeline='[["deskew", "tesseract"], ["binarize", "tesseract"], ["denoise", "tesseract"], ["post_process"]]')
PipelineScore(score=3661, pipeline='[["deskew", "easyocr"], ["deskew", "tesseract"], ["binarize", "easyocr"], ["binarize", "tesseract"], ["denoise", "easyocr"], ["denoise", "tesseract"]]')
PipelineScore(score=3829, pipeline='[["deskew", "easyocr

## Examine a pipeline

In [25]:
def examine(args, align, pipeline, n=5):
    """Print the highest-scoring OCR results of one pipeline beside their gold text.

    Rows matching the pipeline, score set, and gold set are read from the
    database. Each row's score is normalized to a percentage of the gold
    text's length, and the ``n`` rows with the largest normalized score
    (presumably the worst reconstructions) are printed. For every printed row
    the gold text and the OCR text are whitespace-normalized and aligned, and
    each column where the aligned rows disagree is wrapped in an ANSI color
    sequence.

    Args:
        args: Namespace providing ``database``, ``score_set`` and ``gold_set``.
        align: Object whose ``align(texts)`` method takes a list of strings
            and returns the aligned strings, one per input.
        pipeline: Value compared for equality with the ``actions`` column.
        n: Maximum number of rows to print.

    Returns:
        None. Everything is written to stdout.
    """
    highlight = '\033[2;31;34m'
    reset = '\033[0;0m'

    sql = """
        select *
        from ocr_scores
        join gold_standard using (gold_id)
        where actions = ?
        and score_set = ?
        and ocr_scores.gold_set = ?
    """
    # Read the database named by the ``args`` parameter, not the notebook
    # global, so the function honors whatever namespace the caller passes in.
    with db.connect(args.database) as cxn:
        rows = db.execute(
            cxn, sql, [pipeline, args.score_set, args.gold_set]
        )
        # Materialize the rows while the connection is still open.
        scores = [dict(row) for row in rows]

    for score in scores:
        # An empty gold text would divide by zero; treat its length as 1.
        gold_len = len(score['gold_text']) or 1
        score['norm'] = round(score['score'] / gold_len * 100.0)
    scores.sort(key=lambda s: s['norm'], reverse=True)

    # NOTE: the loop variable must stay named ``score`` -- the f-string ``=``
    # specifier below prints the expression text itself.
    for score in scores[:n]:
        texts = [score['gold_text'], score['score_text']]
        # Collapse every run of whitespace (including newlines) to one space.
        texts = [' '.join(ln.split()) for ln in texts]
        aligned = align.align(texts)

        colored = [list(row) for row in aligned]
        for col in range(len(aligned[0])):
            # Highlight the column in every row when the rows disagree.
            if len({row[col] for row in aligned}) > 1:
                for cells in colored:
                    cells[col] = f'{highlight}{cells[col]}{reset}'

        print(
            f"{score['score']=}\t{score['norm']=}\t"
            f"{score['gold_id']=}\t{score['score_id']=}\t{score['label_id']=}"
        )
        for cells in colored:
            print(''.join(cells))
        print()


# The pipeline is identified by the JSON text of its action list, which is
# compared for equality with the `actions` column. json.dumps() with its
# default separators yields the same ", " spacing as the pipeline strings
# printed by the totals cell, so build the key from real data instead of
# hand-normalizing whitespace in a string.
pipeline = json.dumps([
    ["", "easyocr"], ["", "tesseract"],
    ["binarize", "easyocr"], ["binarize", "tesseract"],
    ["post_process"],
])

# Show the default number of highest-scoring rows for the chosen pipeline.
examine(args=ARGS, align=ALIGN, pipeline=pipeline)

score['score']=108	score['norm']=72	score['gold_id']=304	score['score_id']=121176	score['label_id']=235441
PLANTS O[2;31;34mF[0;0m THE GULF STATES. C[2;31;34mO[0;0mLL[2;31;34mE[0;0m[2;31;34mC[0;0m[2;31;34mT[0;0mE[2;31;34mD[0;0m[2;31;34m⋄[0;0m AND DISTRIBUTED B[2;31;34mY[0;0m S. M. T[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0mR[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2;31;34m⋄[0;0m[2