# Dataset preparation -- Normalized OCR output
# Special case for Kraken output

## Read and index all entries

In [1]:
import os.path

In [2]:
# Absolute location of the dataset root, two levels above the working directory.
DATASET_PATH = os.path.abspath(os.path.join("..", "..", "dataset"))
# Directory holding the supervised part of the dataset.
sup_dir = os.path.join(DATASET_PATH, "supervised")
# JSON file with the raw Kraken OCR output for all entries.
kraken_path = os.path.join(
    DATASET_PATH, "supervised", "01-ocr-kraken-raw-json", "all.json"
)

In [3]:
import pandas as pd

In [4]:
import json

In [5]:
# JSON file containing the reference entries.
entries_ref_path = os.path.join(
    sup_dir,
    "10-ref-ocr-ner-json",
    "all.json",
)

In [6]:
# Load the reference entries. The encoding is given explicitly so that decoding
# does not depend on the platform's locale default.
with open(entries_ref_path, encoding="utf-8") as infile:
    entries_ref = json.load(infile)
# Display the number of reference entries loaded.
len(entries_ref)

8772

In [7]:
def uid(x):
    """Return the ``(book, page, id)`` tuple used as the lookup key for an entry.

    Args:
        x: A mapping with at least the keys ``"book"``, ``"page"`` and ``"id"``.

    Returns:
        A 3-tuple ``(x["book"], x["page"], x["id"])``.

    Raises:
        KeyError: If one of the three keys is missing from ``x``.
    """
    return (x["book"], x["page"], x["id"])

In [8]:
# Index the reference entries by uid, keeping only those flagged with a valid box.
entries_index = {uid(e): e for e in entries_ref if e["valid_box"]}
# The raw list is no longer needed once the index is built.
del entries_ref
# Keep the count as the last expression so the notebook actually displays it.
len(entries_index)

## Load predicted OCR

In [9]:
# Load the raw Kraken OCR entries. The encoding is given explicitly so that
# decoding does not depend on the platform's locale default.
with open(kraken_path, encoding="utf-8") as infile:
    raw_entries = json.load(infile)
print(f"Loaded {len(raw_entries)} for Kraken.")

Loaded 8766 for Kraken.


In [10]:
# Display the first Kraken record to inspect which fields it provides.
raw_entries[0]

{'book': 'Notables_communaux_seine_1801',
 'page': 144,
 'id': 351,
 'text_ocr': 'Lamarre, clerc de notaire, rue galite.'}

In [11]:
# Sanity check: index the Kraken entries by their uid.
krak_idx = {uid(entry): entry for entry in raw_entries}
# Two entries sharing a uid would collapse into one key, so equal sizes mean no duplicates.
assert len(krak_idx) == len(raw_entries)
# Every indexed reference entry must have a Kraken counterpart.
assert entries_index.keys() <= krak_idx.keys()

In [12]:
# Kraken entries with no counterpart in the reference index (not a problem).
set(krak_idx).difference(entries_index)

{('Bottin1_1820', 107, 331)}

In [13]:
# Reference entries for which Kraken produced nothing.
set(entries_index).difference(krak_idx)

set()

In [14]:
from text_utils import (
    check_alignment_charset,
    simplify_unicode_charset,
)

# Build the normalized output: one record per Kraken entry that is present in
# the reference index, carrying the reference box and the simplified OCR text.
results = []
skipped_entries = 0

for idx, krak_entry in krak_idx.items():
    if idx not in entries_index:
        # The index only holds reference entries with a valid box, so this
        # entry is either unknown to the reference or was filtered out.
        skipped_entries += 1
        print("\tSkipping entry not in reference or with bad box")
        print(f"\t {krak_entry}")
        continue
    ref_entry = entries_index[idx]

    # The Kraken export has no box to cross-check against the reference
    # layout, so the reference box is reused as-is.
    ref_box = [int(e) for e in ref_entry["box"]]

    new_elem = {k: krak_entry[k] for k in ("id", "book", "page")}
    new_elem["box"] = ref_box
    new_elem["text_ocr"] = simplify_unicode_charset(krak_entry["text_ocr"].strip())
    if not check_alignment_charset(new_elem["text_ocr"]):
        # Fail loudly, naming the offending entry and text.
        raise ValueError(
            f"check_alignment_charset rejected the normalized text of entry {idx}: "
            f"{new_elem['text_ocr']!r}"
        )
    results.append(new_elem)

print("=============================")
print("Finished. Reports:")
print(f"Skipped entries: {skipped_entries}")
# Reference entries that Kraken did not produce at all; any such entry is fatal.
missed_entries = entries_index.keys() - krak_idx.keys()
print(f"Missed entries: {len(missed_entries)}")
if len(missed_entries) > 0:
    raise ValueError("Missing entries.")


	Skipping entry not in reference or with bad box
	 {'book': 'Bottin1_1820', 'page': 107, 'id': 331, 'text_ocr': ''}
Finished. Reports:
Skipped entries: 1
Missed entries: 0


In [15]:
# Destination file for the normalized Kraken entries.
krak_out_path = os.path.join(
    sup_dir,
    "23-ocr-krak-final",
    "all.json",
)

In [16]:
# Create the output directory if it does not exist yet, so the notebook can run
# end to end on a fresh checkout.
os.makedirs(os.path.dirname(krak_out_path), exist_ok=True)
# Write the normalized entries as indented JSON with an explicit encoding.
with open(krak_out_path, "w", encoding="utf-8") as out_file:
    json.dump(results, out_file, indent=2)

In [17]:
# Show the first lines of the written file as a quick visual check.
!head $krak_out_path

[
  {
    "id": 351,
    "book": "Notables_communaux_seine_1801",
    "page": 144,
    "box": [
      267,
      1377,
      444,
      26
