# Prepare dataset - NER filter valid entries
We want to filter the subset of entries for which the alignment of reference tags was successful (i.e. no empty tags were generated) **for all OCR outputs at the same time.**

We will generate a subset with these entries suitable for stratified train/val/test split for experiment 2 (comparison of NER performance on different OCR outputs).

In [1]:
import os.path

In [2]:
# Absolute location of the supervised dataset; the relative part is resolved
# against the current working directory of the notebook.
DATASET_PATH = os.path.abspath(
    os.path.join(os.pardir, os.pardir, "dataset", "supervised")
)
DATASET_PATH

'/home/joseph/git_github/soduco/paper-ner-bench-das22/dataset/supervised'

In [3]:
# One "all.json" per source: the reference annotations first, then the NER tags
# aligned on each of the two OCR outputs (pero, tess).
ref_path, pero_path, tess_path = (
    os.path.join(DATASET_PATH, subdir, "all.json")
    for subdir in ("10-ref-ocr-ner-json", "31-ner_align_pero", "32-ner_align_tess")
)

In [4]:
import json
import io

In [5]:
def merge_filter_ner_gt(ref_path, pero_path, tess_path, csv_path) -> list:
    """Keep the entries aligned on every OCR output and save them as a CSV file.

    Entries are identified by their ``(book, page, id)`` triple. An entry of
    the reference file is kept only when both OCR files contain an entry with
    the same triple, a truthy ``has_valid_ner_xml`` flag and a non-null
    ``ner_xml``.

    The CSV has a header line followed by one line per kept entry, with the
    columns ``ner_xml_ref``, ``ner_xml_pero``, ``ner_xml_tess`` and ``book``.
    Every field is wrapped in double quotes, fields are separated by ``", "``
    and no newline is written after the last line.

    Args:
        ref_path: JSON file containing the list of reference entries.
        pero_path: JSON file containing the entries aligned on the ``pero``
            OCR output.
        tess_path: JSON file containing the entries aligned on the ``tess``
            OCR output.
        csv_path: Destination CSV file; it is overwritten if it exists.

    Returns:
        The data lines written to the CSV (header excluded), in the order of
        the reference file.
    """

    def load_ner_xml(path, only_valid):
        # Map (book, page, id) -> ner_xml. JSON files are read as UTF-8
        # explicitly so the result does not depend on the platform locale.
        with open(path, encoding="utf-8") as f:
            entries = json.load(f)
        return {
            (entry["book"], entry["page"], entry["id"]): entry["ner_xml"]
            for entry in entries
            # The flag is only looked up when required: reference entries are
            # never filtered and may not carry it.
            if not only_valid or entry["has_valid_ner_xml"]
        }

    def csv_line(fields):
        # Embedded double quotes are doubled so that a quote inside a field
        # cannot terminate it early and shift the remaining columns.
        return ", ".join('"' + field.replace('"', '""') + '"' for field in fields)

    ref = load_ner_xml(ref_path, only_valid=False)
    pero = load_ner_xml(pero_path, only_valid=True)
    tess = load_ner_xml(tess_path, only_valid=True)

    lines_to_write = []
    for key, ner_xml_ref in ref.items():
        ner_xml_pero = pero.get(key)
        ner_xml_tess = tess.get(key)
        if ner_xml_pero is None or ner_xml_tess is None:
            # Alignment failed (or is missing) for at least one OCR output.
            continue
        book = key[0]
        lines_to_write.append(
            csv_line((ner_xml_ref, ner_xml_pero, ner_xml_tess, book))
        )

    print("# filtered entries:", len(lines_to_write))

    header = csv_line(("ner_xml_ref", "ner_xml_pero", "ner_xml_tess", "book"))
    # newline='' disables newline translation: lines always end with "\n".
    with open(csv_path, "wt", encoding="UTF-8", newline='',
              errors="strict") as file_output:
        file_output.write(header + "\n")
        file_output.write("\n".join(lines_to_write))

    return lines_to_write

In [6]:
# Write the filtered subset to "gold.csv" and keep what the function returns.
out_path_csv = os.path.join(
    DATASET_PATH, "40-ner_aligned_valid_subset", "gold.csv"
)
ner_filtered = merge_filter_ner_gt(
    ref_path=ref_path,
    pero_path=pero_path,
    tess_path=tess_path,
    csv_path=out_path_csv,
)

# filtered entries: 8341
