# 00 - Datasets generation

This notebook expects the gold datasets to be available as CSV files in `paper-ner-bench-das22/dataset`.
Set the environment variable `INPUT_GOLD_DIR` to change this path.

Outputs:
- In `"00-prepared_datasets/01-experiment_1"`: train, dev & test datasets for each size of training sets for experiment 1
- In `"00-prepared_datasets/02-experiment_2"`: train, dev & test datasets for clean & noisy OCR data (Pero-OCR, Tesseract) for experiment 2 

## 01. Preparation

In [1]:
# Notebook-wide settings, stored as attributes on the local `config` module.
import config

# Switch for the optional debug features (off by default).
config.DEBUG = False
# Seed used for the train/dev/test splits. Keep it unchanged to recreate the paper results.
config.SPLIT_SEED = 42

In [2]:
import os
from pathlib import Path

# Anchor directory for every relative path used below.
# `__file__` is not defined inside a notebook kernel; the former
# `os.path.realpath("__file__")` resolved the *string* "__file__" against the
# current working directory, so the working directory is used explicitly here.
nb_loc = Path.cwd().resolve()

# INPUT / OUTPUT DIRECTORIES
# Default location of the gold datasets, relative to the working directory.
# Set the INPUT_GOLD_DIR environment variable to point somewhere else.
INPUT_GOLD_DIR_DEFAULT = (nb_loc / "../../dataset").resolve()
INPUT_GOLD_DIR = Path(os.getenv("INPUT_GOLD_DIR", INPUT_GOLD_DIR_DEFAULT))

# Root directory under which the prepared datasets are written.
OUTPUT_DIR = nb_loc / "00-prepared_datasets"

# Fail early, with an actionable message, if the gold datasets cannot be found.
assert INPUT_GOLD_DIR.is_dir(), (
    f"Gold dataset directory not found: {INPUT_GOLD_DIR}. "
    "Set the INPUT_GOLD_DIR environment variable to the directory holding the gold CSV files."
)
INPUT_GOLD_DIR, OUTPUT_DIR

(PosixPath('/home/bertrand/dev/paper-ner-bench-das22/dataset'),
 PosixPath('/home/bertrand/dev/paper-ner-bench-das22/src/ner/00-prepared_datasets'))

In [3]:
import numpy as np
import csv

# Paths of the gold CSV files: reference, Pero-OCR and Tesseract.
# The Pero-OCR and Tesseract paths are still placeholders.
GOLD_REF = INPUT_GOLD_DIR / "supervised/10-ref-ocr-ner-json/gold.csv"
GOLD_PERO = INPUT_GOLD_DIR / "" #TODO 
GOLD_TESS = INPUT_GOLD_DIR / "" #TODO

# Read the reference gold file, keeping only the first two columns of each row.
# NOTE(review): judging by the displayed output, column 0 is the annotated entry and
# column 1 the source directory name, kept with its leading space and quotes — confirm.
with open(GOLD_REF, encoding="utf-8") as gold_file:
    gold_rows = [(row[0], row[1]) for row in csv.reader(gold_file)]
gold_reference = np.array(gold_rows)

# An empty reference set means the gold file was not loaded properly.
assert len(gold_reference) > 0
gold_reference

#TODO: PERO + TESS

array([['<PER>Dufan et Clémendot</PER>, <ACT>pharmaciens</ACT>, <LOC>r. de la\u2029Chaussée-d&apos;Antin</LOC>, <CARDINAL>34</CARDINAL>. <TITRE>(Elig.)</TITRE> 449',
        ' "Bottin1_1820"'],
       ['<PER>Dufant (Victor)</PER>, <ACT>libraire</ACT>, <LOC>r. du Gros-Che-\u2029net</LOC>, <CARDINAL>2</CARDINAL>. 392',
        ' "Bottin1_1820"'],
       ['<PER>Dufay</PER>, <ACT>essayeur du commerce</ACT>, <LOC>place Dau-\u2029phine</LOC>, <CARDINAL>5</CARDINAL>.         355',
        ' "Bottin1_1820"'],
       ...,
       ['<PER>Lamare</PER>, <ACT>notaire</ACT>, <LOC>rue du faubourg honoré</LOC>.',
        ' "notables_communaux_seine_1801"'],
       ['<PER>Lamarre</PER> , <ACT>carrier</ACT>, <LOC>rue mouffetard</LOC>.',
        ' "notables_communaux_seine_1801"'],
       ['<PER>Lamarre</PER>, <ACT>clerc de notaire</ACT>, <LOC>rue égalité</LOC>.',
        ' "notables_communaux_seine_1801"']], dtype='<U855')

## 02. Experiment #1

In [4]:
from dataset_util import train_dev_test_split, unwrap # Local imports
from sklearn.model_selection import train_test_split

# CONSTANTS
# Halving stops as soon as the left-over half of a split is smaller than this.
MIN_TRAINSET_SIZE = 30

# Split 72/8/20% w. stratified sampling on directories names
train, dev, test = train_dev_test_split(gold_reference)

# Starting from the full trainset, repeatedly keep a stratified half of the
# most recent trainset to obtain a series of smaller and smaller trainsets.
exp1_trainsets = [train]
t_len = len(train)

while True:
    current = exp1_trainsets[-1]
    try:
        # The second value returned by `unwrap` is used as stratification labels.
        _, groups = unwrap(current)
        smaller, rest = train_test_split(
            current,
            train_size=0.5,
            shuffle=True,
            random_state=config.SPLIT_SEED,
            stratify=groups,
        )
    except ValueError:
        # Stop now if we encounter the error "The least populated class in y has only 1 member".
        break

    # The stopping criterion looks at the discarded half, not at the kept one.
    t_len = len(rest)
    if t_len < MIN_TRAINSET_SIZE:
        break
    exp1_trainsets.append(smaller)

list(map(len, exp1_trainsets)) # Expected: [6373, 3186, 1593, 796, 398, 199, 99, 49]
            

[nltk_data] Downloading package punkt to /home/bertrand/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[6373, 3186, 1593, 796, 398, 199, 99, 49]

In [5]:
# Sanity checks: the sizes below are the ones expected from the fixed split seed.
# Each assertion reports the size actually obtained so a mismatch is easy to diagnose.

# Dev set should contain 709 examples
assert len(dev) == 709, f"Expected 709 dev examples, got {len(dev)}"

# Test set should contain 1690 examples
assert len(test) == 1690, f"Expected 1690 test examples, got {len(test)}"

# Lengths of exp1_trainsets should be fixed (compared regardless of order)
assert sorted([len(s) for s in exp1_trainsets]) == sorted([6373, 3186, 1593, 796, 398, 199, 99, 49]), (
    f"Unexpected trainset sizes: {[len(s) for s in exp1_trainsets]}"
)

In [6]:
# Save on disk
from dataset_util import save_dataset # Local import

output_directory = OUTPUT_DIR / "01-experiment_1"
output_directory.mkdir(exist_ok=True, parents=True) # Create if necessary

# Save one train/dev/test triple per trainset; dev and test are the same for all
# of them, and the number of training examples is passed as the suffix.
# The loop variable is deliberately NOT called `train`: that would rebind the
# full training set defined earlier to the smallest subset once the loop ends.
for trainset in exp1_trainsets:
    datasets = [trainset, dev, test]
    save_dataset(output_directory, datasets, ["train","dev","test"], suffix=len(trainset))

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

## 03. Experiment #2

In [7]:
from dataset_util import train_dev_test_split # Local imports

# Build the train/dev/test splits of the three Experiment 2 datasets
# (reference, Pero-OCR, Tesseract).
# NOTE(review): all three splits below are computed from `gold_reference`. The
# Pero-OCR and Tesseract gold files are not loaded yet (their paths are still
# marked TODO in the loading cell), so the "pero" and "tess" splits are currently
# placeholders built from the reference data — replace their input once loaded.

# Reference gold (manually annotated & corrected examples): Split 72/8/20% w. stratified sampling on directories names
train_ref, dev_ref, test_ref = train_dev_test_split(gold_reference)

# Pero-OCR gold: split 72/8/20% w. stratified sampling on directories names
# TODO: split the Pero-OCR gold dataset instead of `gold_reference`.
train_pero, dev_pero, test_pero = train_dev_test_split(gold_reference)

# Tesseract gold: split 72/8/20% w. stratified sampling on directories names
# TODO: split the Tesseract gold dataset instead of `gold_reference`.
train_tess, dev_tess, test_tess = train_dev_test_split(gold_reference)

In [8]:
# Sanity checks: the reference, Pero-OCR and Tesseract versions of each split
# (train, dev, test) must all contain the same number of examples.

assert len(train_ref) == len(train_pero) == len(train_tess)

assert len(dev_ref) == len(dev_pero) == len(dev_tess)

assert len(test_ref) == len(test_pero) == len(test_tess)

In [9]:
# Save on disk
from dataset_util import save_dataset # Local import

output_directory = OUTPUT_DIR / "02-experiment_2"
output_directory.mkdir(exist_ok=True, parents=True) # Create if necessary

names = ["train", "dev", "test"]

# One (suffix, [train, dev, test]) pair per data source, saved in this order.
exp2_splits = (
    ("ref", [train_ref, dev_ref, test_ref]),
    ("pero", [train_pero, dev_pero, test_pero]),
    ("tess", [train_tess, dev_tess, test_tess]),
)
for suffix, splits in exp2_splits:
    save_dataset(output_directory, splits, names, suffix=suffix)


  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]