# Dataset preparation -- Normalized OCR output

## Read and index all entries

In [1]:
import os.path

In [2]:
# Resolve the dataset root relative to the notebook's working directory.
DATASET_PATH = os.path.abspath("../../dataset")
# Sub-directory holding the supervised data used by every later cell.
sup_dir = os.path.join(DATASET_PATH, "supervised")
# CSV annotation table, read with pandas below.
annot_file = os.path.join(sup_dir, "annotation_table.csv")
annot_file

'/home/joseph/git_github/soduco/paper-ner-bench-das22/dataset/supervised/annotation_table.csv'

In [3]:
import pandas as pd

In [4]:
# Load the annotation table; its "#", "PDF", "VUE" and "Inclus dataset"
# columns are used later to build metadata_index.
annot_metadata = pd.read_csv(annot_file)
annot_metadata.head()

Unnamed: 0,#,PDF,VUE,LISTE,LETTRES,AUTEUR,TEXTE,TAGS,SOURCE,N ENTRÉES,N UNCHECKED,N CHECKED,ASSIGNÉ,TERMINÉ,Inclus dataset,COMMENTAIRE,CATEGORIE,Unnamed: 17
0,1,Bottin1_1820,107,ALPHABETIQUE,,,True,True,EchStrat_20211103,76.0,0.0,76,NATHALIE,True,True,76/76 checked.,"bottin, liste simple",
1,2,Bottin1_1820,201,ALPHABETIQUE,,,True,True,EchStrat_20211103,75.0,0.0,75,NATHALIE,True,True,75/75 checked.,"bottin, liste simple",
2,3,Bottin1_1820,339,PROFESSIONS,,,True,True,EchStrat_20211103,66.0,0.0,66,EDWIN,True,True,66/67 Layout ~ok en font-size=10,"bottin, liste simple",
3,4,Bottin1_1820,589,PROFESSIONS,,,True,True,EchStrat_20211103,86.0,24.0,62,EDWIN,True,True,"62/86 Layout ~ok en font-size=10, qualité horr...","bottin, liste simple",
4,5,Bottin1_1827,37,ALPHABETIQUE,,PASCAL,True,True,EchStrat_20210701,147.0,0.0,147,BERTRAND,True,True,147/148 checked.,"bottin, liste simple, decorations",


In [5]:
import json

In [6]:
# JSON file gathering all reference entries.
all_entries_path = os.path.join(sup_dir, "10-ref-ocr-ner-json", "all.json")

In [7]:
# Read every reference entry into memory. JSON text is UTF-8, so decode it
# explicitly instead of relying on the platform's locale default.
with open(all_entries_path, encoding="utf-8") as infile:
    all_entries = json.load(infile)
len(all_entries)

8772

In [8]:
def e2idx(entry, book=None, page=None):
    """Build the index key ``<book>-<page>-<id>`` identifying an entry.

    ``book`` and ``page`` fall back to ``entry["book"]`` / ``entry["page"]``
    when they are not given. The page and the entry id are zero-padded to
    four digits.
    """
    if book is None:
        book = entry["book"]
    if page is None:
        page = entry["page"]
    prefix = f"{book}-{page:04d}"
    return f'{prefix}-{entry["id"]:04d}'
# Sanity check: show the key built for the first reference entry.
e2idx(all_entries[0])


'Bottin1_1820-0107-0286'

In [9]:
# Index the reference entries by their e2idx key, keeping only those whose
# "valid_box" flag is truthy.
entries_index = {e2idx(e):e for e in all_entries if e["valid_box"]}

## Load predicted OCR

In [10]:
import os.path

In [11]:
# Map each annotation row number ("#") to its book ("PDF"), its page ("VUE")
# and whether it is part of the dataset ("Inclus dataset").
metadata_index = {
    idx: {"book": book, "page": page, "keep": in_dataset} 
    for (idx, book, page, in_dataset) 
    in  zip(annot_metadata["#"], annot_metadata["PDF"], annot_metadata["VUE"], annot_metadata["Inclus dataset"])
    }
# Preview the first three items.
list(metadata_index.items())[:3]

[(1, {'book': 'Bottin1_1820', 'page': 107, 'keep': True}),
 (2, {'book': 'Bottin1_1820', 'page': 201, 'keep': True}),
 (3, {'book': 'Bottin1_1820', 'page': 339, 'keep': True})]

In [12]:
import numpy as np

In [13]:
def load_filter_ocr_pred(data_dir, entries_index, metadata_index) -> list:
    """Load raw OCR predictions and keep only those matching a reference entry.

    For every item of ``metadata_index`` whose ``"keep"`` flag is truthy, the
    file ``<data_dir>/<file_id:04d>.json`` is read and each element whose
    ``"type"`` is ``"ENTRY"`` is looked up in ``entries_index`` through its
    ``e2idx`` key (built with the book and page of the metadata).

    Parameters
    ----------
    data_dir : str
        Directory containing one JSON file per file id.
    entries_index : dict
        Reference entries keyed by ``e2idx``; each value must provide ``"box"``.
    metadata_index : dict
        Maps a file id to a dict with the keys ``"book"``, ``"page"`` and
        ``"keep"``.

    Returns
    -------
    list of dict
        One dict per matched entry, with the keys ``"id"``, ``"text_ocr"``,
        ``"box"`` (the reference box cast to ints), ``"book"`` and ``"page"``.

    Raises
    ------
    ValueError
        If the box of a predicted entry differs from the reference box.
    RuntimeError
        If the same reference entry is matched more than once.
    """
    results = []
    matched_ref = set()
    skipped_entries = 0
    for file_id, meta in metadata_index.items():
        print(f"Processing {meta['book']}/{meta['page']}")
        if not meta["keep"]:
            print("\t → Skipping (not in dataset)")
            continue
        file_path = os.path.join(data_dir, f"{file_id:04d}.json")
        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        with open(file_path, encoding="utf-8") as in_file:
            content = json.load(in_file)
        for elem in content:
            if elem["type"] != "ENTRY":
                continue
            idx = e2idx(elem, book=meta["book"], page=meta["page"])
            # entries_index only holds the references we want to keep, so an
            # unknown key means the prediction has no usable counterpart.
            if idx not in entries_index:
                skipped_entries += 1
                print("\tSkipping entry not in reference or with bad box")
                print(f"\t {elem}")
                continue
            # Compare boxes on their integer coordinates.
            ref_box = [int(e) for e in entries_index[idx]["box"]]
            cur_box = [int(e) for e in elem["box"]]
            if ref_box != cur_box:
                print("!!Wrong box, should not happen! Layout is different.!!")
                print("ref_box", ref_box)
                print("cur_box", cur_box)
                print("Reference:")
                print(f"{entries_index[idx]}")
                print("Current predicted entry:")
                print(f"{elem}")
                print("!!Aborting!!")
                raise ValueError("Incompatible layout.")
            if idx in matched_ref:
                print(f"!!ERROR: already matched entry {idx}")
                raise RuntimeError("Fix your code or your data.")

            matched_ref.add(idx)
            # Keep only the fields needed downstream, plus the page metadata.
            new_elem = {k: elem[k] for k in ("id", "text_ocr")}
            new_elem["box"] = ref_box
            for k in ("book", "page"):
                new_elem[k] = meta[k]
            results.append(new_elem)

    print("=============================")
    print("Finished. Reports:")
    missed_refs = set(entries_index.keys()) - matched_ref
    if len(missed_refs) > 0:
        print("!!ERROR: some refs were left unmatched:!!")
        print(missed_refs)
    else:
        print("We found matching entries for all references.")

    print(f"Skipped entries: {skipped_entries}")

    return results
        

## PERO

In [14]:
# Raw PERO OCR output lives inside the supervised directory: reuse sup_dir
# rather than re-deriving the dataset location from a second relative path.
pero_path = os.path.join(sup_dir, "00-ocr-pero-raw-json")
pero_path

'/home/joseph/git_github/soduco/paper-ner-bench-das22/dataset/supervised/00-ocr-pero-raw-json'

In [15]:
# Load the PERO predictions, filtered against the reference entries.
pero_entries_raw = load_filter_ocr_pred(pero_path, entries_index, metadata_index)
len(pero_entries_raw)

Processing Bottin1_1820/107
	Skipping entry not in reference or with bad box
	 {'activities': [], 'addresses': [{'street_name': 'r. de la Bucherie', 'street_numbers': ['14. 4o2']}], 'checked': True, 'comment': '', 'id': 276, 'ner_xml': '<PER>Dudoit</PER>, <ACT>maçon</ACT>, <LOC>r. de la Bucherie</LOC>, <CARDINAL>14</CARDINAL>. 402', 'origin': 'computer', 'persons': ['Dailoit'], 'text_ocr': 'Dmloit, maçon, r. de la Bucherie, 14: 402', 'box': [122, 200, 387, 23], 'type': 'ENTRY', 'parent': 268}
	Skipping entry not in reference or with bad box
	 {'activities': [], 'addresses': [{'street_name': '*. O.eNEarUn', 'street_numbers': ['20', '3']}, {'street_name': 'es des Mauvaises', 'street_numbers': []}], 'checked': True, 'comment': '', 'id': 306, 'ner_xml': '<PER>Dufay</PER>, <ACT>papetier</ACT>, <LOC>r. S.-Martin</LOC>, <CARDINAL>20</CARDINAL>. <CARDINAL>437</CARDINAL>', 'origin': 'computer', 'persons': ['AAay .'], 'text_ocr': '', 'box': [517, 560, -2, 17], 'type': 'ENTRY', 'parent': 281}
	Sk

8765

In [16]:
from text_utils import (
    simplify_unicode_charset,
    check_alignment_charset, 
)

In [17]:
def clean_ocr_entries(entries_raw):
    """Return shallow copies of the entries with a normalized ``text_ocr``.

    The ``text_ocr`` of each entry is stripped and passed through
    ``simplify_unicode_charset``. A ``ValueError`` carrying the new entry is
    raised as soon as ``check_alignment_charset`` rejects a normalized text.
    """
    cleaned = []
    for raw in entries_raw:
        normalized_text = simplify_unicode_charset(raw["text_ocr"].strip())
        # Copy every field, overriding only the OCR text.
        entry = {**raw, "text_ocr": normalized_text}
        if not check_alignment_charset(entry["text_ocr"]):
            raise ValueError(entry)
        cleaned.append(entry)
    return cleaned

In [18]:
# Normalize the text of the PERO entries; the count is displayed as a check.
pero_entries_final = clean_ocr_entries(pero_entries_raw)
len(pero_entries_final)

Suspect char@060: † (0x2020 DAGGER -- cat.: Po)
Suspect char@028: † (0x2020 DAGGER -- cat.: Po)
Suspect char@011: † (0x2020 DAGGER -- cat.: Po)
Suspect char@007: † (0x2020 DAGGER -- cat.: Po)
Suspect char@002: к (0x43a CYRILLIC SMALL LETTER KA -- cat.: Ll)
Suspect char@027: ƒ (0x192 LATIN SMALL LETTER F WITH HOOK -- cat.: Ll)
Suspect char@015: ƒ (0x192 LATIN SMALL LETTER F WITH HOOK -- cat.: Ll)
Suspect char@009: ƒ (0x192 LATIN SMALL LETTER F WITH HOOK -- cat.: Ll)
Suspect char@020: ƒ (0x192 LATIN SMALL LETTER F WITH HOOK -- cat.: Ll)
Suspect char@011: € (0x20ac EURO SIGN -- cat.: Sc)
Suspect char@018: ƒ (0x192 LATIN SMALL LETTER F WITH HOOK -- cat.: Ll)
Suspect char@010: ƒ (0x192 LATIN SMALL LETTER F WITH HOOK -- cat.: Ll)
Suspect char@008: ƒ (0x192 LATIN SMALL LETTER F WITH HOOK -- cat.: Ll)
Suspect char@012: ƒ (0x192 LATIN SMALL LETTER F WITH HOOK -- cat.: Ll)
Suspect char@035: и (0x438 CYRILLIC SMALL LETTER I -- cat.: Ll)
Suspect char@037: н (0x43d CYRILLIC SMALL LETTER EN -- cat.:

8765

In [19]:
# Destination file for the cleaned PERO entries.
pero_out_path = os.path.join(sup_dir, "21-ocr-pero-final", "all.json")

In [20]:
# Create the destination directory if needed so that a run on a fresh
# checkout does not fail when opening the output file.
os.makedirs(os.path.dirname(pero_out_path), exist_ok=True)
# Serialize the cleaned PERO entries; the encoding is explicit for portability.
with open(pero_out_path, "w", encoding="utf-8") as out_file:
    json.dump(pero_entries_final, out_file, indent=2)

In [21]:
# Peek at the first lines of the file that was just written.
!head $pero_out_path

[
  {
    "id": 286,
    "text_ocr": "Dufau et Cl\u00e9mendot, pharmaciens, r. de la\nCh\u00e4ussee-d Antin.\n\n. JEII",
    "box": [
      127,
      319,
      385,
      38
    ],


## Tesseract

In [22]:
# Raw Tesseract OCR output lives inside the supervised directory: reuse sup_dir
# rather than re-deriving the dataset location from a second relative path.
tess_path = os.path.join(sup_dir, "00-ocr-tess-raw-json")
tess_path

'/home/joseph/git_github/soduco/paper-ner-bench-das22/dataset/supervised/00-ocr-tess-raw-json'

In [23]:
# Load the Tesseract predictions, filtered against the reference entries.
tess_entries_raw = load_filter_ocr_pred(tess_path, entries_index, metadata_index)
len(tess_entries_raw)

Processing Bottin1_1820/107
	Skipping entry not in reference or with bad box
	 {'activities': [], 'addresses': [{'street_name': 'r. de la Bucherie', 'street_numbers': ['14. 4o2']}], 'checked': True, 'comment': '', 'id': 276, 'ner_xml': '<PER>Dudoit</PER>, <ACT>maçon</ACT>, <LOC>r. de la Bucherie</LOC>, <CARDINAL>14</CARDINAL>. 402', 'origin': 'computer', 'persons': ['Dailoit'], 'text_ocr': 'Dndoit, maçon, r. de la Bacherie, 14 4o\n', 'box': [122, 200, 387, 23], 'type': 'ENTRY', 'parent': 268}
	Skipping entry not in reference or with bad box
	 {'activities': [], 'addresses': [{'street_name': '*. O.eNEarUn', 'street_numbers': ['20', '3']}, {'street_name': 'es des Mauvaises', 'street_numbers': []}], 'checked': True, 'comment': '', 'id': 306, 'ner_xml': '<PER>Dufay</PER>, <ACT>papetier</ACT>, <LOC>r. S.-Martin</LOC>, <CARDINAL>20</CARDINAL>. <CARDINAL>437</CARDINAL>', 'origin': 'computer', 'persons': ['AAay .'], 'text_ocr': 'Dufay, papetier, r. S.-Martin, 20. 437', 'box': [517, 560, -2, 17

8765

In [24]:
# Normalize the text of the Tesseract entries; the count is displayed as a check.
tess_entries_final = clean_ocr_entries(tess_entries_raw)
len(tess_entries_final)

Suspect char@048: € (0x20ac EURO SIGN -- cat.: Sc)
Suspect char@043: € (0x20ac EURO SIGN -- cat.: Sc)
Suspect char@020: € (0x20ac EURO SIGN -- cat.: Sc)
Suspect char@008: € (0x20ac EURO SIGN -- cat.: Sc)
Suspect char@023: € (0x20ac EURO SIGN -- cat.: Sc)
Suspect char@001: € (0x20ac EURO SIGN -- cat.: Sc)
Suspect char@014: € (0x20ac EURO SIGN -- cat.: Sc)
Suspect char@017: € (0x20ac EURO SIGN -- cat.: Sc)
Suspect char@014: € (0x20ac EURO SIGN -- cat.: Sc)
Suspect char@037: € (0x20ac EURO SIGN -- cat.: Sc)
Suspect char@011: € (0x20ac EURO SIGN -- cat.: Sc)
Suspect char@011: € (0x20ac EURO SIGN -- cat.: Sc)
Suspect char@009: € (0x20ac EURO SIGN -- cat.: Sc)
Suspect char@013: € (0x20ac EURO SIGN -- cat.: Sc)
Suspect char@008: € (0x20ac EURO SIGN -- cat.: Sc)
Suspect char@681: € (0x20ac EURO SIGN -- cat.: Sc)
Suspect char@021: € (0x20ac EURO SIGN -- cat.: Sc)
Suspect char@020: € (0x20ac EURO SIGN -- cat.: Sc)
Suspect char@081: € (0x20ac EURO SIGN -- cat.: Sc)
Suspect char@006: € (0x20ac EUR

8765

In [25]:
# Destination file for the cleaned Tesseract entries.
tess_out_path = os.path.join(sup_dir, "22-ocr-tess-final", "all.json")

In [26]:
# Create the destination directory if needed so that a run on a fresh
# checkout does not fail when opening the output file.
os.makedirs(os.path.dirname(tess_out_path), exist_ok=True)
# Serialize the cleaned Tesseract entries; the encoding is explicit for portability.
with open(tess_out_path, "w", encoding="utf-8") as out_file:
    json.dump(tess_entries_final, out_file, indent=2)

In [27]:
# Peek at the first lines of the file that was just written.
!head $tess_out_path

[
  {
    "id": 286,
    "text_ocr": "Dafan et Gl\u00e9mendot ; pharmaciens ; +. de la\nCh\u00e4uss\u00e9e-d'Antin . 32. (Elis.) .hl&",
    "box": [
      127,
      319,
      385,
      38
    ],
