# Prepare dataset - NER ground truth alignment

In [1]:
import os.path

In [2]:
# Root directory of the supervised dataset. The relative location is resolved
# against the current working directory of the kernel, so the notebook has to
# be run from its own folder for this to point at the right place.
DATASET_PATH = os.path.abspath(os.path.join("..", "..", "dataset", "supervised"))
DATASET_PATH

'/home/joseph/git_github/soduco/paper-ner-bench-das22/dataset/supervised'

In [3]:
# Input files: the reference annotations first, then one OCR output per engine
# (PERO, Tesseract, Kraken). Every stage directory stores its data in "all.json".
ref_path, pero_path, tess_path, krak_path = (
    os.path.join(DATASET_PATH, stage_dir, "all.json")
    for stage_dir in (
        "10-ref-ocr-ner-json",
        "21-ocr-pero-final",
        "22-ocr-tess-final",
        "23-ocr-krak-final",
    )
)

In [4]:
import json
import io

In [5]:
def e2idx(entry, book=None, page=None):
    """Return the index key of an entry, formatted ``<book>-<page>-<id>``.

    The page and the entry id are zero-padded to at least four digits.

    Parameters
    ----------
    entry : mapping
        Must provide ``"id"``; ``"book"`` and ``"page"`` are only read when
        the corresponding argument is left to ``None``.
    book : optional
        Overrides ``entry["book"]`` when given.
    page : int, optional
        Overrides ``entry["page"]`` when given.

    Returns
    -------
    str
        The key, e.g. ``"Bottin-0012-0003"``.
    """
    if book is None:
        book = entry["book"]
    if page is None:
        page = entry["page"]
    entry_id = entry["id"]
    return f"{book}-{page:04d}-{entry_id:04d}"

In [6]:
from text_utils import add_tags_prediction, xml_contains_empty_tags

In [7]:
def generate_aligned_gt(json_path_ref, json_path_target, output_file):
    """Transfer the reference NER tags onto another OCR output and save the result.

    Both JSON files are loaded as lists of entries and indexed with ``e2idx``
    (if two entries of one file share a key, the later one wins). For each
    target entry, ``add_tags_prediction`` is called with the ``ner_xml`` of the
    matching reference entry and the target's ``text_ocr``; the returned XML is
    stored with the entry. Entries for which ``xml_contains_empty_tags``
    reports empty tags are printed and flagged with
    ``has_valid_ner_xml = False``, but they are still kept in the output.

    Parameters
    ----------
    json_path_ref : str
        Path of the reference JSON file. Its entries must provide
        ``valid_box`` and ``ner_xml`` in addition to the keys read by ``e2idx``.
    json_path_target : str
        Path of the JSON file holding the OCR output to annotate. Its entries
        must provide ``id``, ``book``, ``page`` and ``text_ocr``.
    output_file : str
        Path of the JSON file to write. Missing parent directories are created.

    Returns
    -------
    list of dict
        One dict per target entry with the keys ``id``, ``book``, ``page``,
        ``text_ocr``, ``ner_xml`` and ``has_valid_ner_xml``.

    Raises
    ------
    KeyError
        If a target entry has no counterpart in the reference file.
    RuntimeError
        If the matching reference entry has a falsy ``valid_box``.
    """
    # JSON is read as UTF-8 explicitly: the entries contain non-ASCII text and
    # the platform default encoding is not guaranteed to be UTF-8.
    with open(json_path_ref, encoding="utf-8") as in_file:
        index_ref = {e2idx(e): e for e in json.load(in_file)}
    with open(json_path_target, encoding="utf-8") as in_file:
        index_target = {e2idx(e): e for e in json.load(in_file)}

    new_entries = []
    invalid_entries = 0

    for idx, entry_target in index_target.items():
        if idx not in index_ref:
            # Same exception type as a plain dict lookup, with enough context
            # to locate the offending entry.
            raise KeyError(
                f"Target entry {idx} from {json_path_target} "
                f"has no counterpart in reference file {json_path_ref}"
            )
        entry_ref = index_ref[idx]

        if not entry_ref["valid_box"]:
            raise RuntimeError("We should not have entries with invalid boxes here.")

        new_entry = {k: entry_target[k] for k in ("id", "book", "page", "text_ocr")}
        xml_target = add_tags_prediction(
            entry_ref["ner_xml"], entry_target["text_ocr"], debug=False
        )
        new_entry["ner_xml"] = xml_target
        has_valid_ner_xml = not xml_contains_empty_tags(xml_target)
        new_entry["has_valid_ner_xml"] = has_valid_ner_xml
        if not has_valid_ner_xml:
            invalid_entries += 1
            print("Invalid entry, empty tags.")
            print(f"\t{xml_target}")
            print("")
            # we still append the entry for diagnosis and counting
        new_entries.append(new_entry)

    # Create the output directory on demand so that a fresh checkout can run
    # the notebook from top to bottom.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as out_file:
        json.dump(new_entries, out_file, indent=2)

    print("================================")
    print(f"Invalid entries: {invalid_entries:6d}")
    print(f"Total entries:   {len(new_entries):6d}")
    print("================================")

    return new_entries


In [8]:
# Align the reference NER tags on the PERO OCR output; the aligned entries are
# written to the "31-ner_align_pero" stage and kept in memory for the CSV export.
out_path = os.path.join(DATASET_PATH, "31-ner_align_pero", "all.json")
ner_pero_align = generate_aligned_gt(
    json_path_ref=ref_path,
    json_path_target=pero_path,
    output_file=out_path,
)

Invalid entry, empty tags.
	<PER>Dufau et Clémendot</PER>, <ACT>pharmaciens</ACT>, <LOC>r. de la Chäussee-d Antin</LOC>. <CARDINAL> </CARDINAL>. <TITRE>JEII</TITRE>

Invalid entry, empty tags.
	<PER>Dufour</PER>, <ACT>march. de planches</ACT>, <LOC>r. Chapon</LOC>,<CARDINAL> </CARDINAL> 20.

Invalid entry, empty tags.
	<PER>Dufresne (Mad.)</PER>, <ACT>lingère</ACT>, <LOC>r. S. Denis</LOC>.<CARDINAL></CARDINAL>

Invalid entry, empty tags.
	<PER>Dufour</PER>, <ACT>march. de vins</ACT>, <LOC>r. Ste.-Margnerite. S.-German</LOC> o<CARDINAL></CARDINAL>.

Invalid entry, empty tags.
	<PER>Michonnet</PER>; <ACT>Maçon</ACT>, <LOC>petile r. Taranne</LOC>, <CARDINAL></CARDINAL> 403

Suspect char@060: † (0x2020 DAGGER -- cat.: Po)
Suspect char@028: † (0x2020 DAGGER -- cat.: Po)
Invalid entry, empty tags.
	<PER>Miguet jeune</PER>, <ACT>épicier</ACT>, <LOC>r. Neuve-S.-Martin</LOC>,<CARDINAL></CARDINAL>

Invalid entry, empty tags.
	<PER>Milcent</PER>, <ACT>peintre en bátimens</ACT>, <LOC>r. Buffankt</

In [9]:
# Align the reference NER tags on the Tesseract OCR output; the aligned entries
# are written to the "32-ner_align_tess" stage and kept in memory for the CSV export.
out_path = os.path.join(DATASET_PATH, "32-ner_align_tess", "all.json")
ner_tess_align = generate_aligned_gt(
    json_path_ref=ref_path,
    json_path_target=tess_path,
    output_file=out_path,
)

Suspect char@048: € (0x20ac EURO SIGN -- cat.: Sc)
Invalid entry, empty tags.
	<PER>Dafourmantelle (Lonis)</PER>, <ACT>peaussier</ACT>, <LOC>r. dn DR es + FR</LOC> <CARDINAL></CARDINAL>4e

Invalid entry, empty tags.
	&apos;mn: em <PER>Dudngnon</PER>, <ACT>march. deplâtre</ACT>, <LOC>x. des Murais- ! SGermoio</LOC> set <CARDINAL></CARDINAL>, 403

Invalid entry, empty tags.
	<PER>Migeon</PER>, <ACT>charpentier</ACT>, <LOC>x, de Vaugirard</LOC> <CARDINAL></CARDINAL>; pa

Invalid entry, empty tags.
	<PER></PER>, <ACT>ve</ACT>. <LOC>des ee Conte</LOC>, <CARDINAL>e</CARDINAL><LOC>s</LOC>

Invalid entry, empty tags.
	<PER>&quot;PSEue, (François)</PER>; <ACT>comruiss.</ACT>, <LOC>4 die</LOC><CARDINAL></CARDINAL>

Invalid entry, empty tags.
	! <PER>Miguot</PER>, <ACT>hôtel garni</ACT>, <LOC>% de la Jamienng</LOC>,<CARDINAL></CARDINAL>

Invalid entry, empty tags.
	<PER>Miguet jeune</PER>, <ACT>épécier</ACT>, <LOC>E ere se</LOC><CARDINAL></CARDINAL>

Suspect char@043: € (0x20ac EURO SIGN -- cat.:

In [10]:
# Align the reference NER tags on the Kraken OCR output; the aligned entries
# are written to the "33-ner_align_krak" stage and kept in memory for the CSV export.
out_path = os.path.join(DATASET_PATH, "33-ner_align_krak", "all.json")
ner_krak_align = generate_aligned_gt(
    json_path_ref=ref_path,
    json_path_target=krak_path,
    output_file=out_path,
)

Invalid entry, empty tags.
	<PER>Buzenet</PER>, <LOC>rue galande</LOC>, n.<CARDINAL></CARDINAL>.

Invalid entry, empty tags.
	<PER>Thiriet</PER> , <LOC>rue Christie</LOC>,<CARDINAL></CARDINAL>

Invalid entry, empty tags.
	<PER>Vadame, cte d&apos;Unsebowut</PER>, <LOC>ue d vat-lllae</LOC>  <CARDINAL></CARDINAL>.

Invalid entry, empty tags.
	<PER>Walleville</PER>, <LOC>rne Serv ndoni</LOC>, <CARDINAL></CARDINAL>.

Invalid entry, empty tags.
	<PER>ali besnoyers</PER>, <LOC>rue Cloche-Perche</LOC> , <CARDINAL></CARDINAL>.

Invalid entry, empty tags.
	<PER>Val:yer</PER>, <ACT>cure de oint-Germain-l&apos;fu- errois</ACT>, <LOC>ue de l&apos;Arbre-Sec</LOC>, <CARDINAL></CARDINAL>.

Invalid entry, empty tags.
	r...- <PER>Truvault</PER>, <LOC>rue del MIicholiire</LOC>. <CARDINAL></CARDINAL>.

Invalid entry, empty tags.
	<PER>r outau</PER> , <LOC>rue licher</LOC>, <CARDINAL></CARDINAL>.

Invalid entry, empty tags.
	r..- <PER>Tourolle</PER>, <LOC>rue icelieu</LOC> , <CARDINAL></CARDINAL>.

Invalid

In [11]:
import io

In [12]:
def export_gold(entries, csv_path):
    """Write the valid entries to a two-column, CSV-like text file.

    Only entries whose ``has_valid_ner_xml`` is truthy are exported. Each one
    becomes a line of the form ``"<ner_xml>", "<book>"``; lines are separated
    by ``"\\n"`` with no trailing newline, and the file is encoded as UTF-8.
    The number of exported entries is printed.

    NOTE: field values are written verbatim, without any escaping, so a double
    quote or a line break inside ``ner_xml`` or ``book`` would yield an
    ambiguous row.

    Parameters
    ----------
    entries : iterable of dict
        Entries providing ``has_valid_ner_xml``, ``ner_xml`` (str) and
        ``book`` (str).
    csv_path : str
        Path of the file to write; an existing file is overwritten.
    """
    rows = [
        "".join(('"', item["ner_xml"], '", "', item["book"], '"'))
        for item in entries
        if item["has_valid_ner_xml"]
    ]
    print(f"Valid entries: {len(rows)}")
    payload = "\n".join(rows)

    # newline='' disables newline translation: "\n" is written as is on every platform.
    with open(csv_path, "w", encoding="UTF-8", newline='', errors="strict") as csv_file:
        csv_file.write(payload)

In [13]:
# Export the valid PERO-aligned entries next to the aligned JSON file.
out_path_csv = os.path.join(DATASET_PATH, "31-ner_align_pero", "gold.csv")
export_gold(entries=ner_pero_align, csv_path=out_path_csv)

Valid entries: 8392


In [14]:
# Export the valid Tesseract-aligned entries next to the aligned JSON file.
out_path_csv = os.path.join(DATASET_PATH, "32-ner_align_tess", "gold.csv")
export_gold(entries=ner_tess_align, csv_path=out_path_csv)

Valid entries: 8700


In [15]:
# Export the valid Kraken-aligned entries next to the aligned JSON file.
out_path_csv = os.path.join(DATASET_PATH, "33-ner_align_krak", "gold.csv")
export_gold(entries=ner_krak_align, csv_path=out_path_csv)

Valid entries: 7990
