# Dataset export
Gather the independent JSON files (reference, PERO and Tesseract parts) into a single large file.

## Path definitions

In [1]:
from pathlib import Path

In [2]:
# Input JSON files: the reference dataset and the two parts merged onto it.
path_ref = Path("10-ner_ref") / "all_with_nested_ner_2023.json"
path_pero = Path("31-ner_align_pero") / "all.json"
path_tess = Path("32-ner_align_tess") / "all.json"

## Load all files

In [3]:
import json

In [4]:
def load_data(filename: Path) -> list:
    """Read a UTF-8 encoded JSON file and return its decoded content.

    Args:
        filename (Path): path of the JSON file to read

    Returns:
        list: whatever the JSON document decodes to (the annotation says
            a list; the function itself does not check the type)
    """
    with open(filename, encoding="utf8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

In [5]:
# Load the reference dataset and both parts, one file each.
data_ref = load_data(path_ref)
data_pero = load_data(path_pero)
data_tess = load_data(path_tess)

In [6]:
# Number of entries in each loaded dataset (reference, pero, tess).
tuple(map(len, (data_ref, data_pero, data_tess)))

(8772, 8765, 8765)

## Merge dataset parts

In [7]:
# Part name -> entries; the name becomes the suffix of the merged field names.
dataset_parts = dict(pero=data_pero, tess=data_tess)

In [8]:
def merge_datasets(data_ref: list, dataset_parts: dict[str, list]) -> list:
    """Merge several dataset parts together.

    Reference entries with a truthy ``"valid_box"`` are copied; in each copy
    ``"nested_ner_xml"`` and ``"text_ocr"`` are renamed with a ``_ref`` suffix
    and the flat ``"ner_xml"`` key is dropped. The ``text_ocr``,
    ``nested_ner_xml`` and ``has_valid_ner_xml`` fields of every part entry
    are then attached to the reference entry sharing the same
    ``(book, page, id)`` under ``"<field>_<part name>"`` keys.

    The entries of ``data_ref`` are not modified: the function works on
    shallow copies, so it can safely be called again on the same input
    (e.g. when the notebook cell is re-run).

    Args:
        data_ref (list): base dataset to join on
        dataset_parts (dict[str, list]): name -> entries

    Returns:
        list: merged parts (one new dict per valid reference entry)

    Raises:
        KeyError: if an entry lacks one of the expected keys, or if a part
            entry has no matching valid reference entry.
    """
    result = []
    for e in data_ref:
        # Keep only elements with valid content (filters 7 buggy entries)
        if not e["valid_box"]:
            continue
        # Shallow copy so that the caller's entries keep their original keys
        merged = dict(e)
        # Rename keys for text_ocr and nested_ner_xml
        merged["nested_ner_xml_ref"] = merged.pop("nested_ner_xml")
        merged["text_ocr_ref"] = merged.pop("text_ocr")
        # delete flat reference from original dataset (not the one used for evaluation here)
        del merged["ner_xml"]
        result.append(merged)
    # Build fast access index
    ref_index = {(e["book"], e["page"], e["id"]): e for e in result}
    for name, data in dataset_parts.items():
        for e2 in data:
            e1 = ref_index[e2["book"], e2["page"], e2["id"]]
            for field in ("text_ocr", "nested_ner_xml", "has_valid_ner_xml"):
                e1[f"{field}_{name}"] = e2[field]
    return result

In [9]:
# Join the pero/tess parts onto the valid reference entries.
dataset_full = merge_datasets(data_ref, dataset_parts)

In [10]:
# Number of merged entries (reference entries that passed the valid_box filter).
len(dataset_full)

8765

In [11]:
# Inspect the first merged entry to eyeball the resulting keys.
dataset_full[0]

{'id': 286,
 'box': [127.59990697081633,
  319.9430680456292,
  385.6102733261905,
  38.49245993571395],
 'book': 'Bottin1_1820',
 'page': 107,
 'valid_box': True,
 'nested_ner_xml_ref': "<PER>Dufan et Clémendot</PER>, <ACT>pharmaciens</ACT>, <SPAT><LOC>r. de la\u2029Chaussée-d'Antin</LOC>, <CARDINAL>34</CARDINAL></SPAT>. <TITRE>(Elig.)</TITRE> 449",
 'text_ocr_ref': "Dufan et Clémendot, pharmaciens, r. de la\nChaussée-d'Antin, 34. (Elig.) 449",
 'text_ocr_pero': 'Dufau et Clémendot, pharmaciens, r. de la\nChäussee-d Antin.\n\n. JEII',
 'nested_ner_xml_pero': '<PER>Dufau et Clémendot</PER>, <ACT>pharmaciens</ACT>, <SPAT><LOC>r. de la\u2029Chäussee-d Antin</LOC>.\u2029<CARDINAL>\u2029</CARDINAL></SPAT>. <TITRE>JEII</TITRE>',
 'has_valid_ner_xml_pero': False,
 'text_ocr_tess': "Dafan et Glémendot ; pharmaciens ; +. de la\nChäussée-d'Antin . 32. (Elis.) .hl&",
 'nested_ner_xml_tess': '<PER>Dafan et Glémendot</PER> ; <ACT>pharmaciens</ACT> ; <SPAT><LOC>+. de la\u2029Chäussée-d&apos;Antin</

## Sanity check on XML escaping

In [12]:
import re

In [15]:
# Look for an isolated "&" that does not start one of the XML entities
# &apos; &quot; &gt; &lt; &amp;
PAT_ERR = r"&(?!(apos|quot|gt|lt|amp);)"
def check_xml_entities(dataset: list):
    """Print every entry whose nested NER XML contains an unescaped "&".

    The ``nested_ner_xml_ref``, ``nested_ner_xml_pero`` and
    ``nested_ner_xml_tess`` fields of each entry are scanned with
    ``PAT_ERR``. An offending entry is printed once (the remaining fields of
    that entry are skipped), then the scan continues with the next entry.
    ``"All done."`` is printed at the end whether or not errors were found.

    Args:
        dataset (list): entries (dicts) holding the three XML fields

    Raises:
        KeyError: if an entry lacks one of the three XML fields.
    """
    for e in dataset:
        for key in ("nested_ner_xml_ref", "nested_ner_xml_pero", "nested_ner_xml_tess"):
            content = e[key]
            # re.search scans the whole string; re.match would only test
            # position 0 and miss any "&" located further in the content.
            if re.search(PAT_ERR, content):
                print("ERROR: unescaped XML special char in entry:")
                print(e)
                break
    print("All done.")

In [16]:
# Run the XML entity sanity check over the merged dataset.
check_xml_entities(dataset_full)

All done.


## Export the final file

In [18]:
# Destination of the merged dataset, written relative to the working directory.
OUTPUT_FILE = "dataset_full.json"
# Serialize the whole merged list as indented JSON in a UTF-8 text file.
with open(OUTPUT_FILE, "w", encoding="utf8") as out_file:
    json.dump(obj=dataset_full, fp=out_file, indent=2)