# Dataset preparation

In [1]:
import os.path

In [2]:
DATASET_PATH = os.path.abspath("../../dataset")
DATASET_PATH

'/home/joseph/git_github/soduco/paper-ner-bench-das22/dataset'

# Unsupervised pre-training set

In [3]:
unsup_path = os.path.join(DATASET_PATH, "unsupervised_pretraining")
unsup_path

'/home/joseph/git_github/soduco/paper-ner-bench-das22/dataset/unsupervised_pretraining'

In [4]:
files =  !find $unsup_path/00-raw_json -name "*.json"
len(files)

6887

In [5]:
import json

In [6]:
files[0]

'/home/joseph/git_github/soduco/paper-ner-bench-das22/dataset/unsupervised_pretraining/00-raw_json/Duverneuil_et_La_Tynna_1805-234.json'

In [7]:
import re

In [8]:
book_page_re = r'.*/([^/]*)-([0-9]+)\.json'
re.match(book_page_re, files[0]).groups()

('Duverneuil_et_La_Tynna_1805', '234')

In [9]:
content = None
with open(files[0]) as infile:
    content = json.load(infile)
if not isinstance(content, list):
    content = content["content"]
book, page = re.match(book_page_re, files[0]).groups()
print(book, page)
limit = 10
for ii, element in enumerate(content):
    if ii > limit:
        break
    if element["type"] == "ENTRY":
        for k in ("id", "box", "text_ocr"):
            print(f"\t{k}\t{element[k]}")

Duverneuil_et_La_Tynna_1805 234
	id	261
	box	[144, 133, 42, 16]
	text_ocr	
	id	262
	box	[138, 133, 722, 68]
	text_ocr	Negoctans.
116
Oberkampf (prop. de la manuf. de Jouy), Poitier (J. J. X.). ( en clouterie , armes
et alun de Liége), R. S. Denis, 5. —
pied-à-terre R. de Choiseul. 714. Lep.
	id	263
	box	[138, 199, 479, 19]
	text_ocr	Odent et comp
fab. à Courtalin , dep. de
Lombards.
	id	264
	box	[135, 219, 724, 78]
	text_ocr	Seine et Marne), maison de com. à l'aris,
R. de Seine S. Germain, 49 1403.
Ogilvie, R. et Div. de la Place Ven-
dôme , 4—203.
Pommery et fils , (en épiceries) , R. et Di-
vis. des Lombards , 37, et à la Villette ,
pour les eaux-de-vie.
Pons et comp. , R. du Faub. S. Laurent,
	id	265
	box	[133, 278, 498, 39]
	text_ocr	dome
, 4—203.
Ollivier ( A. C. ), Outrequin R. du Gros
Pons et comp.
187.—Nord.
	id	266
	box	[153, 317, 163, 15]
	text_ocr	Chenet, 11. —
Br


In [10]:
from text_utils import simplify_unicode_charset, check_alignment_charset

In [11]:
dataset = []
allrawstr = []
allnewstr = []
limit = 10000000
count = 0
for file in files:
    content = None
    with open(file) as infile:
        content = json.load(infile)
    if not isinstance(content, list):
        content = content["content"]
    book, page = re.match(book_page_re, file).groups()
    for elem in content:
        count += 1
        if count > limit:
            print("!!!!!!!!! REACHED LIMIT !!!!!!!!!")
            break
        if elem["type"] != "ENTRY":
            continue
        newelem = {}
        for k in ["id", "box"]:
            newelem[k] = elem[k]
        newelem["book"] = book
        newelem["page"] = page
        orig_txt = elem["text_ocr"]
        new_txt = simplify_unicode_charset(orig_txt.strip())
        newelem["text_ocr"] = new_txt
        if not check_alignment_charset(new_txt, strict=True):  # discard bad entries
            print(f"Bad charset for {book}/{page}/{elem['id']}")
            continue
        dataset.append(newelem)
        allrawstr.append(orig_txt)
        allnewstr.append(new_txt)

        

Suspect char@010: ƒ (0x192 LATIN SMALL LETTER F WITH HOOK -- cat.: Ll)
Bad charset for Bottin1_1837/209/287
Suspect char@010: ƒ (0x192 LATIN SMALL LETTER F WITH HOOK -- cat.: Ll)
Bad charset for Bottin1_1837/209/289
Suspect char@012: ƒ (0x192 LATIN SMALL LETTER F WITH HOOK -- cat.: Ll)
Bad charset for Bottin1_1837/209/326
Suspect char@016: ƒ (0x192 LATIN SMALL LETTER F WITH HOOK -- cat.: Ll)
Bad charset for Bottin1_1837/209/330
Suspect char@020: ƒ (0x192 LATIN SMALL LETTER F WITH HOOK -- cat.: Ll)
Bad charset for Bottin1_1837/209/332
Suspect char@020: ƒ (0x192 LATIN SMALL LETTER F WITH HOOK -- cat.: Ll)
Bad charset for Bottin1_1837/209/363
Suspect char@056: ƒ (0x192 LATIN SMALL LETTER F WITH HOOK -- cat.: Ll)
Bad charset for Bottin1_1827/111/272
Suspect char@015: ƒ (0x192 LATIN SMALL LETTER F WITH HOOK -- cat.: Ll)
Bad charset for Bottin1_1827/111/299
Suspect char@014: ƒ (0x192 LATIN SMALL LETTER F WITH HOOK -- cat.: Ll)
Bad charset for Bottin1_1837/29/324
Suspect char@009: ƒ (0x192 LA

In [12]:
check_alignment_charset("\n".join(allrawstr), dump_charset=True)

Suspect char@3973: „ (0x201e DOUBLE LOW-9 QUOTATION MARK -- cat.: Ps)
Suspect char@10417: „ (0x201e DOUBLE LOW-9 QUOTATION MARK -- cat.: Ps)
Suspect char@13838: ’ (0x2019 RIGHT SINGLE QUOTATION MARK -- cat.: Pf)
Suspect char@24020: ’ (0x2019 RIGHT SINGLE QUOTATION MARK -- cat.: Pf)
Suspect char@24263: „ (0x201e DOUBLE LOW-9 QUOTATION MARK -- cat.: Ps)
Suspect char@26557: „ (0x201e DOUBLE LOW-9 QUOTATION MARK -- cat.: Ps)
Suspect char@34504: „ (0x201e DOUBLE LOW-9 QUOTATION MARK -- cat.: Ps)
Suspect char@43040: „ (0x201e DOUBLE LOW-9 QUOTATION MARK -- cat.: Ps)
Suspect char@54479: „ (0x201e DOUBLE LOW-9 QUOTATION MARK -- cat.: Ps)
Suspect char@63829: “ (0x201c LEFT DOUBLE QUOTATION MARK -- cat.: Pi)
Suspect char@78330: „ (0x201e DOUBLE LOW-9 QUOTATION MARK -- cat.: Ps)
Suspect char@84719: „ (0x201e DOUBLE LOW-9 QUOTATION MARK -- cat.: Ps)
Suspect char@95325: „ (0x201e DOUBLE LOW-9 QUOTATION MARK -- cat.: Ps)
Suspect char@99425: „ (0x201e DOUBLE LOW-9 QUOTATION MARK -- cat.: Ps)
Suspect 

True

In [13]:
check_alignment_charset("\n".join(allnewstr), dump_charset=True)

Charset:
    # | Char
  ----+---------------------
  5769203 |   (0x20 SPACE -- cat.: Zs)
  5090689 | e (0x65 LATIN SMALL LETTER E -- cat.: Ll)
  3074810 | r (0x72 LATIN SMALL LETTER R -- cat.: Ll)
  2908262 | , (0x2c COMMA -- cat.: Po)
  2684136 | i (0x69 LATIN SMALL LETTER I -- cat.: Ll)
  2670793 | a (0x61 LATIN SMALL LETTER A -- cat.: Ll)
  2303413 | t (0x74 LATIN SMALL LETTER T -- cat.: Ll)
  2298997 | n (0x6e LATIN SMALL LETTER N -- cat.: Ll)
  2146637 | s (0x73 LATIN SMALL LETTER S -- cat.: Ll)
  2055060 | o (0x6f LATIN SMALL LETTER O -- cat.: Ll)
  1928460 | '\n' (0xa '\n' -- cat.: Cc)
  1913196 | u (0x75 LATIN SMALL LETTER U -- cat.: Ll)
  1884020 | . (0x2e FULL STOP -- cat.: Po)
  1765959 | l (0x6c LATIN SMALL LETTER L -- cat.: Ll)
  1150715 | d (0x64 LATIN SMALL LETTER D -- cat.: Ll)
  1052236 | c (0x63 LATIN SMALL LETTER C -- cat.: Ll)
  991184 | - (0x2d HYPHEN-MINUS -- cat.: Pd)
  767635 | m (0x6d LATIN SMALL LETTER M -- cat.: Ll)
  712347 | p (0x70 LATIN SMALL LETTER P --

True

In [14]:
dataset[:10]

[{'id': 261,
  'box': [144, 133, 42, 16],
  'book': 'Duverneuil_et_La_Tynna_1805',
  'page': '234',
  'text_ocr': ''},
 {'id': 262,
  'box': [138, 133, 722, 68],
  'book': 'Duverneuil_et_La_Tynna_1805',
  'page': '234',
  'text_ocr': 'Negoctans.\n116\nOberkampf (prop. de la manuf. de Jouy), Poitier (J. J. X.). ( en clouterie , armes\net alun de Liége), R. S. Denis, 5. —\npied-à-terre R. de Choiseul. 714. Lep.'},
 {'id': 263,
  'box': [138, 199, 479, 19],
  'book': 'Duverneuil_et_La_Tynna_1805',
  'page': '234',
  'text_ocr': 'Odent et comp\nfab. à Courtalin , dep. de\nLombards.'},
 {'id': 264,
  'box': [135, 219, 724, 78],
  'book': 'Duverneuil_et_La_Tynna_1805',
  'page': '234',
  'text_ocr': "Seine et Marne), maison de com. à l'aris,\nR. de Seine S. Germain, 49 1403.\nOgilvie, R. et Div. de la Place Ven-\ndôme , 4—203.\nPommery et fils , (en épiceries) , R. et Di-\nvis. des Lombards , 37, et à la Villette ,\npour les eaux-de-vie.\nPons et comp. , R. du Faub. S. Laurent,"},
 {'id': 26

In [15]:
len(dataset)

1058196

In [16]:
output_path = os.path.join(unsup_path, "10-normalized", "all.json")
with open(output_path, "w") as outfile:
    json.dump(dataset, outfile, indent=2)

In [17]:
!head $output_path

[
  {
    "id": 261,
    "box": [
      144,
      133,
      42,
      16
    ],
    "book": "Duverneuil_et_La_Tynna_1805",


In [18]:
str_export = []
for ss in allnewstr:
    if len(ss) < 10:
        continue
    # we could also detect wrong scripts and discard…
    str_export.append(ss.replace("\n", " "))

## Create text file

In [19]:
import io

In [20]:
out_path = os.path.join(unsup_path, "10-normalized", "all.txt")
with io.open(out_path, "wt", encoding="UTF-8", newline='',
            errors="strict") as file_output:
# output lines are utf-8-encoded and have LF EOL
    file_output.write("\n".join(str_export))

In [21]:
!head $out_path

Negoctans. 116 Oberkampf (prop. de la manuf. de Jouy), Poitier (J. J. X.). ( en clouterie , armes et alun de Liége), R. S. Denis, 5. — pied-à-terre R. de Choiseul. 714. Lep.
Odent et comp fab. à Courtalin , dep. de Lombards.
Seine et Marne), maison de com. à l'aris, R. de Seine S. Germain, 49 1403. Ogilvie, R. et Div. de la Place Ven- dôme , 4—203. Pommery et fils , (en épiceries) , R. et Di- vis. des Lombards , 37, et à la Villette , pour les eaux-de-vie. Pons et comp. , R. du Faub. S. Laurent,
dome , 4—203. Ollivier ( A. C. ), Outrequin R. du Gros Pons et comp. 187.—Nord.
Chenet, 11. — Br
Chenet , 11. — Br. Olry et comp , R. de Provence, 53. — Mont Bl. Porchet (Guillaume) , R. Cérutty, 33. Poréet (en mousselines ). R. Thibautodé , 10 Muačum.
Mont Bl. Oudot neven, R. Serpente, 6.—Th Fr.  19.—Muséum. Portarieu , R. Mêlée . 35.—Gravil.
Oudot neveu, R. Serpente, 6.—Th Fr. Paillard frères ( dépot de la manufact. de de fayence de Chantilly), R. de Ménil- Montant, 17. — Popinc. Portarieu , 

In [22]:
!wc -l $out_path

1045673 /home/joseph/git_github/soduco/paper-ner-bench-das22/dataset/unsupervised_pretraining/10-normalized/all.txt
