# Dataset preparation -- OCR and NER reference

In [1]:
import os.path

In [2]:
# Absolute path of the dataset root, located two directory levels above the
# current working directory.
DATASET_PATH = os.path.abspath(os.path.join(os.pardir, os.pardir, "dataset"))
DATASET_PATH

'/home/joseph/git_github/soduco/paper-ner-bench-das22/dataset'

In [3]:
import pandas as pd

In [4]:
# Directory of the supervised part of the dataset, and the annotation table
# stored directly inside it.
sup_dir = os.path.join(DATASET_PATH, "supervised")
annot_file = os.path.join(DATASET_PATH, "supervised", "annotation_table.csv")
annot_file

'/home/joseph/git_github/soduco/paper-ner-bench-das22/dataset/supervised/annotation_table.csv'

In [5]:
# Peek at the first lines of the raw annotation table (shell `head`).
!head $annot_file

#,PDF,VUE,LISTE,LETTRES,AUTEUR,TEXTE,TAGS ,SOURCE,N ENTRÉES,N UNCHECKED,N CHECKED,ASSIGNÉ,TERMINÉ,Inclus dataset,COMMENTAIRE,CATEGORIE,
1,Bottin1_1820,107,ALPHABETIQUE,,,TRUE,TRUE,EchStrat_20211103,76,0,76,NATHALIE,TRUE,TRUE,76/76 checked.,"bottin, liste simple",
2,Bottin1_1820,201,ALPHABETIQUE,,,TRUE,TRUE,EchStrat_20211103,75,0,75,NATHALIE,TRUE,TRUE,75/75 checked.,"bottin, liste simple",
3,Bottin1_1820,339,PROFESSIONS,,,TRUE,TRUE,EchStrat_20211103,66,0,66,EDWIN,TRUE,TRUE,66/67 Layout ~ok en font-size=10,"bottin, liste simple",
4,Bottin1_1820,589,PROFESSIONS,,,TRUE,TRUE,EchStrat_20211103,86,24,62,EDWIN,TRUE,TRUE,"62/86 Layout ~ok en font-size=10, qualité horrible.","bottin, liste simple",
5,Bottin1_1827,37,ALPHABETIQUE,,PASCAL,TRUE,TRUE,EchStrat_20210701,147,0,147,BERTRAND,TRUE,TRUE,147/148 checked.,"bottin, liste simple, decorations",
6,Bottin1_1827,117,ALPHABETIQUE,,LAUREENCIA,TRUE,TRUE,EchStrat_20210701,138,0,138,BERTRAND,TRUE,TRUE,138/139 checked.,"bottin, liste simple, decorations

In [6]:
# Load the annotation table; each row describes one item identified by its
# `#`, `PDF` and `VUE` columns.
# The lines of the CSV end with a trailing comma, which is why pandas reports
# an extra, empty "Unnamed: 17" column.
annot_metadata = pd.read_csv(annot_file)
annot_metadata

Unnamed: 0,#,PDF,VUE,LISTE,LETTRES,AUTEUR,TEXTE,TAGS,SOURCE,N ENTRÉES,N UNCHECKED,N CHECKED,ASSIGNÉ,TERMINÉ,Inclus dataset,COMMENTAIRE,CATEGORIE,Unnamed: 17
0,1,Bottin1_1820,107,ALPHABETIQUE,,,True,True,EchStrat_20211103,76.0,0.0,76,NATHALIE,True,True,76/76 checked.,"bottin, liste simple",
1,2,Bottin1_1820,201,ALPHABETIQUE,,,True,True,EchStrat_20211103,75.0,0.0,75,NATHALIE,True,True,75/75 checked.,"bottin, liste simple",
2,3,Bottin1_1820,339,PROFESSIONS,,,True,True,EchStrat_20211103,66.0,0.0,66,EDWIN,True,True,66/67 Layout ~ok en font-size=10,"bottin, liste simple",
3,4,Bottin1_1820,589,PROFESSIONS,,,True,True,EchStrat_20211103,86.0,24.0,62,EDWIN,True,True,"62/86 Layout ~ok en font-size=10, qualité horr...","bottin, liste simple",
4,5,Bottin1_1827,37,ALPHABETIQUE,,PASCAL,True,True,EchStrat_20210701,147.0,0.0,147,BERTRAND,True,True,147/148 checked.,"bottin, liste simple, decorations",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,91,Panckoucke_comm_1820,732,ALPHABETIQUE,,,False,False,EchStrat_20210701,,,0,(personne),False,False,illisible cf. ci-dessus,,
91,92,Panckoucke_hab_Dulac_1820,184,ALPHABETIQUE,,,False,False,EchStrat_20210701,,,0,(personne),False,False,illisible cf. ci-dessus,,
92,93,Panckoucke_hab_Dulac_1820,330,ALPHABETIQUE,,,False,False,EchStrat_20210701,,,0,(personne),False,False,illisible cf. ci-dessus,,
93,94,Panckoucke_hab_Dulac_1820,676,ALPHABETIQUE,,,False,False,EchStrat_20210701,,,0,(personne),False,False,illisible cf. ci-dessus,,


In [7]:
# Location of the per-item JSON annotation files. Despite the ".pdf" suffix
# this path is used as a directory: files are read from it as
# <json_dir>/NNNN.json.
json_dir = os.path.join(sup_dir, "00-dataset_article_das22.pdf")

In [8]:
import json
from text_utils import (
    simplify_unicode_charset, 
    replace_annotation_codes, 
    fix_manual_ner_xml, 
    check_alignment_charset, 
    remove_xml_tags_and_entities)

In [20]:
# Build the OCR/NER reference dataset from the per-item JSON annotation files.
#
# For every row of the annotation table flagged "Inclus dataset", load
# <json_dir>/<item_id on 4 digits>.json and go through its elements. Only
# elements of type "ENTRY" that were manually checked are considered. An entry
# is exported only when the text recovered from its NER annotation (XML tags
# and entities removed) is equal to its stripped OCR text; otherwise it is
# reported on stdout and counted in `lost_entries`.
dataset = []          # exported entries: dicts with id, box, book, page, text_ocr, ner_xml
all_raw_ocr_txt = []  # stripped OCR text of each exported entry
all_new_ocr_txt = []  # same texts after replace_annotation_codes + simplify_unicode_charset
limit = 10000000      # maximum number of checked entries to process
count = 0             # checked entries processed so far (exported or lost)
lost_entries = 0      # checked entries dropped because NER and OCR texts differ

pages = zip(annot_metadata["#"], annot_metadata["PDF"],
            annot_metadata["VUE"], annot_metadata["Inclus dataset"])
for item_id, book, page, in_dataset in pages:
    print(item_id, book, page, in_dataset)
    if not in_dataset:
        print("\t >> SKIPPING (not in dataset)")
        continue

    # JSON text is UTF-8: do not rely on the platform default encoding, the
    # files contain accented and other non-ASCII characters.
    json_path = os.path.join(json_dir, f"{item_id:04d}.json")
    with open(json_path, encoding="utf-8") as infile:
        content = json.load(infile)

    for elem in content:
        # `>=` so that at most `limit` entries are processed
        # (a strict `>` would let one extra entry through).
        if count >= limit:
            break
        if elem["type"] != "ENTRY":
            continue
        if not elem.get("checked"):
            continue
        count += 1

        orig_ocr_txt = elem["text_ocr"].strip()
        new_ocr_txt = simplify_unicode_charset(replace_annotation_codes(orig_ocr_txt))
        if not check_alignment_charset(new_ocr_txt):
            # Reported only; the entry is not discarded for this reason.
            print(f"Bad charset for {book}/{page}/{elem['id']}")

        orig_ner_xml = elem["ner_xml"].strip()
        new_ner_xml = fix_manual_ner_xml(orig_ner_xml)

        # The consistency check is done on the original (unmodified) texts.
        orig_ner_text = remove_xml_tags_and_entities(orig_ner_xml)
        if orig_ner_text != orig_ocr_txt:
            print("ERROR: NER and OCR don't match.")
            print(f"ERROR: OCR: \t{orig_ocr_txt}")
            print(f"ERROR: NER: \t{orig_ner_text}")
            lost_entries += 1
            continue

        newelem = {
            "id": elem["id"],
            "box": elem["box"],
            "book": book,
            "page": page,
            "text_ocr": new_ocr_txt,
            "ner_xml": new_ner_xml,
        }
        dataset.append(newelem)
        all_raw_ocr_txt.append(orig_ocr_txt)
        all_new_ocr_txt.append(new_ocr_txt)
        

1 Bottin1_1820 107 True
ERROR: NER and OCR don't match.
ERROR: OCR: 	Dmloit, maçon, r. de la Bucherie, 14: 402
ERROR: NER: 	Dudoit, maçon, r. de la Bucherie, 14. 402
ERROR: NER and OCR don't match.
ERROR: OCR: 	Duez, mienuisier, r. de la Tixeranderie, 51.
p
ZO
ERROR: NER: 	Duez, menuisier, r. de la Tiseranderie, 51.
408
ERROR: NER and OCR don't match.
ERROR: OCR: 	Duflos, horloger, r. du Faub.-S.-Marlin
O.
S1
ERROR: NER: 	Duflos, horloger, r. du Faub.-S.-Martin,
60. 375
ERROR: NER and OCR don't match.
ERROR: OCR: 	Duflos, papetier, r. Poissonniète ; 19. 437
ERROR: NER: 	Duflos, papetier , r. Poissonnière, 10. 437
2 Bottin1_1820 201 True
ERROR: NER and OCR don't match.
ERROR: OCR: 	Milet (V.e), md. de papiers peints, boulev.
S.-Martin, 39. 440
ERROR: NER: 	Milet (V.°), md. de papiers peints, boulev.
S.-Martin, 39. 440
3 Bottin1_1820 339 True
4 Bottin1_1820 589 True
5 Bottin1_1827 37 True
6 Bottin1_1827 117 True
ERROR: NER and OCR don't match.
ERROR: OCR: 	Devaudichon, anc. commiss. cont

In [21]:
# Number of checked entries dropped because their NER text and OCR text differ.
lost_entries

65

## Summary of incorrect chars in annotated content
```
1 Bottin1_1820 107 True
2 Bottin1_1820 201 True
3 Bottin1_1820 339 True
4 Bottin1_1820 589 True
Suspect char@007: ƒ (0x192 LATIN SMALL LETTER F WITH HOOK -- cat.: Ll)
Suspect char@018: ƒ (0x192 LATIN SMALL LETTER F WITH HOOK -- cat.: Ll)
5 Bottin1_1827 37 True
6 Bottin1_1827 117 True
7 Bottin1_1827 452 True
8 Bottin1_1827 475 True
9 Bottin1_1837 80 True
10 Bottin1_1837 114 True
11 Bottin1_1837 203 True
12 Bottin1_1837 345 True
13 Bottin1_1837 663 True
14 Bottin1_1837 667 True
Suspect char@013: ƒ (0x192 LATIN SMALL LETTER F WITH HOOK -- cat.: Ll)
Suspect char@029: ƒ (0x192 LATIN SMALL LETTER F WITH HOOK -- cat.: Ll)
15 Bottin3_1854a 72 True
16 Bottin3_1854a 74 True
17 Bottin3_1854a 200 True
18 Bottin3_1854a 238 True
21 Bottin3_1854a 892 True
22 Bottin3_1854a 1049 True
Suspect char@038: с (0x441 CYRILLIC SMALL LETTER ES -- cat.: Ll)
Suspect char@039: а (0x430 CYRILLIC SMALL LETTER A -- cat.: Ll)
Suspect char@041: р (0x440 CYRILLIC SMALL LETTER ER -- cat.: Ll)
Suspect char@042: о (0x43e CYRILLIC SMALL LETTER O -- cat.: Ll)
Suspect char@044: х (0x445 CYRILLIC SMALL LETTER HA -- cat.: Ll)
Suspect char@086: с (0x441 CYRILLIC SMALL LETTER ES -- cat.: Ll)
Suspect char@087: а (0x430 CYRILLIC SMALL LETTER A -- cat.: Ll)
Suspect char@089: р (0x440 CYRILLIC SMALL LETTER ER -- cat.: Ll)
Suspect char@090: о (0x43e CYRILLIC SMALL LETTER O -- cat.: Ll)
Suspect char@092: х (0x445 CYRILLIC SMALL LETTER HA -- cat.: Ll)
23 Cambon_almgene_1841 141 True
24 Cambon_almgene_1841 301 True
25 Cambon_almgene_1841 330 True
26 Cambon_almgene_1841 375 True
27 Cambon_almgene_1841 418 True
28 Cambon_almgene_1841 487 True
29 Deflandre_1828 278 True
30 Deflandre_1828 310 True
31 Deflandre_1828 881 True
32 Deflandre_1828 937 True
33 Deflandre_1829 505 True
34 Deflandre_1829 743 True
35 Deflandre_1829 949 True
36 Deflandre_1829 1026 True
37 Didot_1841a 162 True
38 Didot_1841a 183 True
39 Didot_1841a 206 True
40 Didot_1841a 316 True
41 Didot_1841a 500 True
Suspect char@034: р (0x440 CYRILLIC SMALL LETTER ER -- cat.: Ll)
Suspect char@035: о (0x43e CYRILLIC SMALL LETTER O -- cat.: Ll)
Suspect char@061: р (0x440 CYRILLIC SMALL LETTER ER -- cat.: Ll)
Suspect char@062: о (0x43e CYRILLIC SMALL LETTER O -- cat.: Ll)
42 Didot_1841a 542 True
Suspect char@009: С (0x421 CYRILLIC CAPITAL LETTER ES -- cat.: Lu)
Suspect char@020: С (0x421 CYRILLIC CAPITAL LETTER ES -- cat.: Lu)
43 Didot_1851a 92 True
44 Didot_1851a 169 True
45 Didot_1851a 226 True
46 Didot_1851a 415 True
47 Didot_1851a 419 True
48 Didot_1851a 639 True
49 Didot_1851a 698 True
52 Didot_1854a 83 True
53 Didot_1854a 326 True
Suspect char@112: ٭ (0x66d ARABIC FIVE POINTED STAR -- cat.: Po)
54 Didot_1854a 607 True
58 DidotBottin_1860a 186 True
59 DidotBottin_1860a 280 True
60 DidotBottin_1861a 238 True
61 DidotBottin_1861a 424 True
62 Duverneuil_et_La_Tynna_1801 260 True
63 Duverneuil_et_La_Tynna_1801 371 True
64 Duverneuil_et_La_Tynna_1801 401 True
65 Duverneuil_et_La_Tynna_1801 415 True
66 Duverneuil_et_La_Tynna_1801 454 True
67 Duverneuil_et_La_Tynna_1805 193 True
68 Duverneuil_et_La_Tynna_1805 250 True
Suspect char@004: ˘ (0x2d8 BREVE -- cat.: Sk)
Suspect char@032: ˘ (0x2d8 BREVE -- cat.: Sk)
69 Duverneuil_et_La_Tynna_1805 251 True
70 Duverneuil_et_La_Tynna_1805 292 True
71 Duverneuil_et_La_Tynna_1805 305 True
72 Duverneuil_et_La_Tynna_1806 147 True
73 Duverneuil_et_La_Tynna_1806 220 True
76 Favre_et_Duchesne_1798 375 True
77 Favre_et_Duchesne_1798 429 True
78 Favre_et_Duchesne_1798 625 True
79 Favre_et_Duchesne_1798 658 True
80 Favre_et_Duchesne_1798 700 True
81 Favre_et_Duchesne_1798 701 True
Suspect char@010: В (0x412 CYRILLIC CAPITAL LETTER VE -- cat.: Lu)
Suspect char@015: В (0x412 CYRILLIC CAPITAL LETTER VE -- cat.: Lu)
82 La_Tynna_1813 163 True
83 La_Tynna_1813 166 True
84 La_Tynna_1813 346 True
85 La_Tynna_1813 377 True
86 notables_communaux_seine_1801 57 True
87 notables_communaux_seine_1801 144 True
```

## Fix bad chars before reimport
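These in-place `sed` substitutions are idempotent. They were run once, before the notebook was relaunched and the JSON files re-imported, which is why their execution counts are lower than those of the import cell above.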

In [10]:
# Uncorrected OCR char: replace every "ƒ" (f with hook) by a plain "f" in
# items 0004 and 0014, editing the files in place.
# The `:::` syntax is GNU parallel's: one `sed` is run per listed file.
!parallel sed -i s/ƒ/f/g ::: $json_dir/0004.json $json_dir/0014.json

In [11]:
# Wrong detection: the glyph is a capital B over a black circle, so every
# circled R in item 0021 is replaced by a circled B (file edited in place).
!sed -i s/Ⓡ/Ⓑ/g $json_dir/0021.json

In [12]:
# Mixed scripts (Cyrillic and Latin) in OCR output: the word on the left-hand
# side contains Cyrillic look-alike letters; rewrite it with Latin letters only
# in item 0022 (file edited in place).
!sed -i s/Quinсаmроiх/Quincampoix/g $json_dir/0022.json

In [13]:
# Mixed scripts (Cyrillic and Latin) in OCR output: the first two letters of
# the pattern are Cyrillic look-alikes; replace them with Latin "po" in item
# 0041 (file edited in place).
!sed -i s/роix/poix/g $json_dir/0041.json

In [14]:
# Mixed scripts (Cyrillic and Latin) in OCR output: the leading letter of the
# pattern is a Cyrillic capital ES; replace it with a Latin "C" in item 0042.
# NOTE(review): the dots in the pattern are unescaped, so each one matches any
# single character, not only a literal ".".
!sed -i 's/С. F. 1839./C. F. 1839./g'  $json_dir/0042.json

In [15]:
# Wrong chars: replace the two star-like characters (black star and Arabic
# five pointed star) with an ASCII asterisk in item 0053 (file edited in place).
!sed -i 's/★/*/g;s/٭/*/g' $json_dir/0053.json

In [16]:
# Noise (Unicode category Sk, modifier symbols): delete every standalone breve
# character from item 0068 (file edited in place).
!sed -i "s/˘//g"  $json_dir/0068.json

In [17]:
# Mixed scripts (Cyrillic and Latin) in OCR output: the second initial in the
# pattern is a Cyrillic capital VE; replace it with a Latin "B" in item 0081.
# NOTE(review): the dots in the pattern are unescaped, so each one matches any
# single character, not only a literal ".".
!sed -i 's/Mallet (J.В.)/Mallet (J.B.)/g' $json_dir/0081.json

## Export

In [22]:
# Number of entries kept for export.
len(dataset)

8772

In [24]:
# Preview the first three exported entries.
dataset[:3]

[{'id': 286,
  'box': [127.59990697081633,
   319.9430680456292,
   385.6102733261905,
   38.49245993571395],
  'book': 'Bottin1_1820',
  'page': 107,
  'text_ocr': "Dufan et Clémendot, pharmaciens, r. de la\nChaussée-d'Antin, 34. (Elig.) 449",
  'ner_xml': '<PER>Dufan et Clémendot</PER>, <ACT>pharmaciens</ACT>, <LOC>r. de la\u2029Chaussée-d&apos;Antin</LOC>, <CARDINAL>34</CARDINAL>. <TITRE>(Elig.)</TITRE> 449'},
 {'id': 290,
  'box': [125.49036875669651, 357.68003651377927, 386.0, 41.88096136047642],
  'book': 'Bottin1_1820',
  'page': 107,
  'text_ocr': 'Dufant (Victor), libraire, r. du Gros-Che-\nnet, 2. 392',
  'ner_xml': '<PER>Dufant (Victor)</PER>, <ACT>libraire</ACT>, <LOC>r. du Gros-Che-\u2029net</LOC>, <CARDINAL>2</CARDINAL>. 392'},
 {'id': 292,
  'box': [127.0, 398.61236450520784, 387.0, 40.387635494792164],
  'book': 'Bottin1_1820',
  'page': 107,
  'text_ocr': 'Dufay, essayeur du commerce, place Dau-\nphine, 5.         355',
  'ner_xml': '<PER>Dufay</PER>, <ACT>essayeur du 

In [25]:
# Destination file of the JSON export (all kept entries in a single file).
out_path = os.path.join(sup_dir, "10-ref-ocr-ner-json", "all.json")
out_path

'/home/joseph/git_github/soduco/paper-ner-bench-das22/dataset/supervised/10-ref-ocr-ner-json/all.json'

In [28]:
# Export the whole dataset as one pretty-printed JSON file.
# The destination directory is created first so the export also works when it
# does not exist yet (e.g. on a fresh checkout); `exist_ok=True` makes this a
# no-op otherwise. The encoding is explicit so the result does not depend on
# the platform default.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
with open(out_path, 'w', encoding="utf-8") as out_file:
    json.dump(dataset, out_file, indent=2)

In [29]:
# Check the beginning of the JSON file that was just written.
!head $out_path

[
  {
    "id": 286,
    "box": [
      127.59990697081633,
      319.9430680456292,
      385.6102733261905,
      38.49245993571395
    ],
    "book": "Bottin1_1820",


In [30]:
# Destination file of the text export of the NER annotations; it is placed in
# the same "10-ref-ocr-ner-json" directory as the JSON export.
out_path_csv = os.path.join(sup_dir, "10-ref-ocr-ner-json", "gold.csv")

In [39]:
# Serialize the dataset as one text line per entry: the NER XML followed by the
# book name, each wrapped in double quotes and separated by a comma and a space.
# NOTE(review): the fields are concatenated as-is, so a double quote inside
# `ner_xml` would not be escaped -- confirm the XML only carries quotes as
# entities before parsing this file with a CSV reader.
lines_to_write = [
    "".join(('"', entry['ner_xml'], '", "', entry['book'], '"'))
    for entry in dataset
]
all_lines = "\n".join(lines_to_write)

In [40]:
import io

In [41]:
# Write the pre-joined lines as UTF-8 text. `newline=''` disables newline
# translation, so the "\n" separators are written unchanged on every platform.
# No final newline is added after the last line.
with open(out_path_csv, mode="wt", encoding="UTF-8", errors="strict",
          newline='') as csv_out:
    csv_out.write(all_lines)

In [42]:
# Check the beginning of the text export that was just written.
!head $out_path_csv

"<PER>Dufan et Clémendot</PER>, <ACT>pharmaciens</ACT>, <LOC>r. de la Chaussée-d&apos;Antin</LOC>, <CARDINAL>34</CARDINAL>. <TITRE>(Elig.)</TITRE> 449", "Bottin1_1820"
"<PER>Dufant (Victor)</PER>, <ACT>libraire</ACT>, <LOC>r. du Gros-Che- net</LOC>, <CARDINAL>2</CARDINAL>. 392", "Bottin1_1820"
"<PER>Dufay</PER>, <ACT>essayeur du commerce</ACT>, <LOC>place Dau- phine</LOC>, <CARDINAL>5</CARDINAL>.         355", "Bottin1_1820"
"<PER>Dulay</PER>, <ACT>chandronnier</ACT>, <LOC>r. du Pont- aux Choux</LOC>, <CARDINAL>15</CARDINAL>. 314", "Bottin1_1820"
"<PER>Dufay (V.e)</PER>, <ACT>grenetière</ACT>, <LOC>r. du Faub.-S. Denis</LOC>, <CARDINAL>20</CARDINAL>. 372", "Bottin1_1820"
"<PER>Dufay</PER>, <ACT>papetier</ACT>, <LOC>r. S.-Martin</LOC>, <CARDINAL>20</CARDINAL>. <CARDINAL>437</CARDINAL>", "Bottin1_1820"
"<PER>Dufeu</PER>, <ACT>charcutier</ACT>, <LOC>r. Montmartre</LOC>, <CARDINAL>89</CARDINAL>. 318", "Bottin1_1820"
"<PER>Dufey fils</PER>, <ACT>bijoutier</ACT>, <LOC>passage de la Réunion</

**WARNING**: we now have many more escaped XML entities than in the previous gold version.