# 00 - Datasets generation

Outputs:
- In `01-experiment_1_prepared_datasets`: train, dev & test datasets for each size of training sets for experiment 1
- In `02-experiment_2_prepared_datasets`: train, dev & test datasets for clean & noisy OCR data (Pero-OCR, Tesseract, Kraken) for experiment 2

## Initialization

In [12]:
import os, sys
from pathlib import Path

# Detect Google Colab by looking for 'google.colab' in the string form of the active IPython shell.
ENV_IS_GOOGLE_COLAB = 'google.colab' in str(get_ipython())
# Environment variables can only hold strings, hence the str() conversion ("True" / "False").
os.environ["ENV_IS_GOOGLE_COLAB"] = str(ENV_IS_GOOGLE_COLAB)

if ENV_IS_GOOGLE_COLAB:
    from google.colab import drive
    mountpoint = Path("/content/drive")
    drive.mount(str(mountpoint))  # Mount gdrive at the mountpoint
    base = mountpoint / "MyDrive/SODUCO/article_das_2022"  # Adapt this to your situation
    sys.path.append(str(base))  # Add BASE to Python Path
else:
    # `__file__` is not defined inside a notebook, so BASE is the current working
    # directory (which Jupyter normally sets to the directory of the notebook).
    base = Path.cwd()

BASE = Path(base).resolve()  # Make BASE absolute
print(sys.path)
BASE

['/home/bertrand/dev/paper-ner-bench-das22/src/ner', '/home/bertrand/anaconda3/lib/python39.zip', '/home/bertrand/anaconda3/lib/python3.9', '/home/bertrand/anaconda3/lib/python3.9/lib-dynload', '', '/home/bertrand/anaconda3/lib/python3.9/site-packages', '/home/bertrand/anaconda3/lib/python3.9/site-packages/locket-0.2.1-py3.9.egg', '/home/bertrand/anaconda3/lib/python3.9/site-packages/IPython/extensions', '/home/bertrand/.ipython']


PosixPath('/home/bertrand/dev/paper-ner-bench-das22/src/ner')

## Constants

In [13]:
# GLOBAL CONSTANTS
import config
import os
from pathlib import Path

config.SPLIT_SEED = 42  # Random seed used in train/dev/test. Do not change it if you want to recreate the paper results.
config.DEBUG = False  # If true, text versions of the spacy datasets will be saved along with the .spacy files

# INPUT / OUTPUT
# DATASET_DIR is the root folder that the data-loading cells below read the gold CSV files from.
# It can be overridden with the DATASET_DIR environment variable.
# NOTE(review): the default assumes the layout <repo>/dataset next to <repo>/src/ner — confirm and adapt.
DATASET_DIR = Path(os.environ.get("DATASET_DIR", BASE.parent.parent / "dataset")).resolve()
OUTPUT_DIR = BASE  # Prepared datasets are written in sub-directories of BASE
OUTPUT_DIR

PosixPath('/home/bertrand/dev/paper-ner-bench-das22/src/ner')

# 01. Experiment #1

In [14]:
import numpy as np
import pandas as pd
import csv

# Location of the gold CSV used for experiment 1.
GOLD_REF = DATASET_DIR / "supervised/10-ref-ocr-ner-json/gold.csv"
assert GOLD_REF.exists()

# The file is read without a header row; the two names below label the data columns.
# `skipinitialspace` expects a boolean: the string 'True' only worked because it is truthy.
gold_reference = pd.read_csv(GOLD_REF, header=None, names=["ner_xml", "book"], skipinitialspace=True)
gold_reference

Unnamed: 0,ner_xml,book
0,"<PER>Dufan et Clémendot</PER>, <ACT>pharmacien...",Bottin1_1820
1,"<PER>Dufant (Victor)</PER>, <ACT>libraire</ACT...",Bottin1_1820
2,"<PER>Dufay</PER>, <ACT>essayeur du commerce</A...",Bottin1_1820
3,"<PER>Dulay</PER>, <ACT>chandronnier</ACT>, <LO...",Bottin1_1820
4,"<PER>Dufay (V.e)</PER>, <ACT>grenetière</ACT>,...",Bottin1_1820
...,...,...
8767,"<PER>Lamarche</PER>, <ACT>géographe</ACT> , <L...",notables_communaux_seine_1801
8768,"<PER>Lamarck</PER>, <ACT>membre de l&apos;inst...",notables_communaux_seine_1801
8769,"<PER>Lamare</PER>, <ACT>notaire</ACT>, <LOC>ru...",notables_communaux_seine_1801
8770,"<PER>Lamarre</PER> , <ACT>carrier</ACT>, <LOC>...",notables_communaux_seine_1801


In [15]:
from dataset_util import train_dev_test_split, unwrap # Local imports
from sklearn.model_selection import train_test_split

# CONSTANTS
MIN_TRAINSET_SIZE = 30  # Trainsets smaller than this are not generated

# Split 72/8/20% w. stratified sampling on directories names
train, dev, test = train_dev_test_split(gold_reference.to_numpy())

# Iteratively split the trainset in half to create smaller trainsets.
# The list starts with the full trainset; each new entry is one half of the previous entry.
exp1_trainsets = [train]

while True:
    try:
        current = exp1_trainsets[-1]
        # The second value returned by unwrap() is used as the stratification labels.
        _, groups = unwrap(current)
        smaller, _ = train_test_split(
            current,
            train_size=0.5,
            shuffle=True,
            random_state=config.SPLIT_SEED,
            stratify=groups,
        )
    except ValueError:
        # Stop now if we encounter the error "The least populated class in y has only 1 member".
        break

    # Check the half that is actually kept (not the discarded one), so that no
    # stored trainset can ever be smaller than MIN_TRAINSET_SIZE.
    if len(smaller) < MIN_TRAINSET_SIZE:
        break
    exp1_trainsets.append(smaller)

[len(s) for s in exp1_trainsets]  # Should be [6373, 3186, 1593, 796, 398, 199, 99, 49]
            

[6373, 3186, 1593, 796, 398, 199, 99, 49]

In [16]:
# Sanity checks on the experiment 1 splits

# The dev split is expected to hold exactly 709 examples.
n_dev = len(dev)
assert n_dev == 709

# The test split is expected to hold exactly 1690 examples.
n_test = len(test)
assert n_test == 1690

# Lengths of exp1_trainsets should be fixed (compared regardless of order).
observed_sizes = sorted(len(subset) for subset in exp1_trainsets)
assert observed_sizes == [49, 99, 199, 398, 796, 1593, 3186, 6373]

In [17]:
# Save on disk
from dataset_util import save_dataset # Local import

output_directory = OUTPUT_DIR / "01-experiment_1_prepared_datasets"
output_directory.mkdir(exist_ok=True, parents=True)  # Create if necessary

# One train/dev/test triple is saved per training-set size; dev and test are shared by all of them.
# The loop variable is deliberately NOT named `train`, so the full training split
# created by the splitting cell is not overwritten by the last (smallest) trainset.
for trainset in exp1_trainsets:
    datasets = [trainset, dev, test]
    # The trainset size is passed as the suffix to tell the saved triples apart.
    save_dataset(output_directory, datasets, ["train", "dev", "test"], suffix=len(trainset))

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

# 02. Experiment #2

In [18]:
import pandas as pd

# Load the Gold dataset used for experiment 2
GOLD_REF = DATASET_DIR / "supervised/40-ner_aligned_valid_subset/gold.csv"

assert GOLD_REF.exists()

# `skipinitialspace` expects a boolean: the string 'True' only worked because it is truthy.
exp_2_gold_total = pd.read_csv(GOLD_REF, skipinitialspace=True)
exp_2_gold_total

Unnamed: 0,ner_xml_ref,ner_xml_pero,ner_xml_tess,book
0,"<PER>Dufant (Victor)</PER>, <ACT>libraire</ACT...","☞ T <PER>Dufant (Victor)</PER>, <ACT>libraire...","<PER>Dofaut (Victor)</PER>, <ACT>Sbraire</ACT>...",Bottin1_1820
1,"<PER>Dufay</PER>, <ACT>essayeur du commerce</A...","<PER>Dutay</PER>, <ACT>essayeur du commerce</A...","<PER>Dufay</PER>, <ACT>essayeur du commerce</A...",Bottin1_1820
2,"<PER>Dulay</PER>, <ACT>chandronnier</ACT>, <LO...","<PER>Dulay</PER>, <ACT>chandronnier</ACT>, <LO...","<PER>Dufay</PER>, <ACT>chandronnier</ACT>, <LO...",Bottin1_1820
3,"<PER>Dufay (V.e)</PER>, <ACT>grenetière</ACT>,...","<PER>Dufay (V.e)</PER>, <ACT>grenetière</ACT>,...","&quot;<PER>Dufay (V.e)</PER>, <ACT>grenetière<...",Bottin1_1820
4,"<PER>Dufeu</PER>, <ACT>charcutier</ACT>, <LOC>...","Y ☞ <PER>Dnten</PER>,<ACT>charentier</ACT>, <L...","<PER>Dufen</PER> . <ACT>chareutier</ACT>, <LOC...",Bottin1_1820
...,...,...,...,...
8336,"<PER>Lamarche</PER>, <ACT>géographe</ACT> , <L...","<PER>Lamarche</PER>, <ACT>geographe</ACT> , <L...","<PER>Lamarche</PER>, <ACT>geographe</ACT> , <L...",notables_communaux_seine_1801
8337,"<PER>Lamarck</PER>, <ACT>membre de l&apos;inst...","<PER>Lamarck</PER>, <ACT>membre de l&apos;inst...","<PER>Lamarck</PER>, <ACT>membre de l&apos;inst...",notables_communaux_seine_1801
8338,"<PER>Lamare</PER>, <ACT>notaire</ACT>, <LOC>ru...","<PER>Lamare</PER> , <ACT>notaire</ACT>, <LOC>r...","<PER>Lamare</PER> , <ACT>notaire</ACT>, <LOC>r...",notables_communaux_seine_1801
8339,"<PER>Lamarre</PER> , <ACT>carrier</ACT>, <LOC>...","<PER>Lamarre</PER>, <ACT>carrier</ACT>, <LOC>r...","<PER>Lamarre</PER> , <ACT>Carrier</ACT>, <LOC>...",notables_communaux_seine_1801


In [19]:
from dataset_util import train_dev_test_split # Local imports

# Fail early, with an explicit message, if one of the columns used below is absent from the gold file.
_required_columns = ["ner_xml_ref", "ner_xml_pero", "ner_xml_tess", "ner_xml_kraken", "book"]
_missing_columns = [c for c in _required_columns if c not in exp_2_gold_total.columns]
if _missing_columns:
    raise KeyError(f"Columns missing from the experiment 2 gold dataset: {_missing_columns}")

# Reference gold (manually annotated & corrected examples): Split 72/8/20% w. stratified sampling on directories names
ref = exp_2_gold_total[["ner_xml_ref", "book"]]
train_ref, dev_ref, test_ref = train_dev_test_split(ref.to_numpy())

# Pero-OCR gold: split 72/8/20% w. stratified sampling on directories names
pero = exp_2_gold_total[["ner_xml_pero", "book"]]
train_pero, dev_pero, test_pero = train_dev_test_split(pero.to_numpy())

# Tesseract gold: split 72/8/20% w. stratified sampling on directories names
tess = exp_2_gold_total[["ner_xml_tess", "book"]]
train_tess, dev_tess, test_tess = train_dev_test_split(tess.to_numpy())

# Kraken gold: split 72/8/20% w. stratified sampling on directories names
# The split must be computed from the Kraken frame itself (not from the Tesseract one).
kraken = exp_2_gold_total[["ner_xml_kraken", "book"]]
train_kraken, dev_kraken, test_kraken = train_dev_test_split(kraken.to_numpy())


In [20]:
# Sanity checks: all train (resp. dev, test) sets must be the exact same size.

# The three reference splits together must account for every row of the gold dataset.
# NOTE(review): this replaces a check that len(train_ref) == 8342, which cannot hold for a
# 72% split of a dataset of that order of magnitude — confirm the intended expected total.
# Assumes train_dev_test_split() does not drop any example — TODO confirm.
assert len(train_ref) + len(dev_ref) + len(test_ref) == len(exp_2_gold_total)

# For each split kind, every OCR variant must have as many examples as the reference.
for ref_split, *ocr_splits in (
    (train_ref, train_pero, train_tess, train_kraken),
    (dev_ref, dev_pero, dev_tess, dev_kraken),
    (test_ref, test_pero, test_tess, test_kraken),
):
    assert all(len(ocr_split) == len(ref_split) for ocr_split in ocr_splits)

In [21]:
# Save on disk
from dataset_util import save_dataset # Local import

# Target folder for the experiment 2 datasets; created (with parents) when absent.
output_directory = OUTPUT_DIR / "02-experiment_2_prepared_datasets"
output_directory.mkdir(parents=True, exist_ok=True)

names = ["train", "dev", "test"]

# Each (suffix, splits) pair is saved with one save_dataset() call, in this exact order:
# reference first, then the Pero-OCR, Tesseract and Kraken variants.
splits_by_suffix = (
    ("ref", [train_ref, dev_ref, test_ref]),
    ("pero", [train_pero, dev_pero, test_pero]),
    ("tess", [train_tess, dev_tess, test_tess]),
    ("kraken", [train_kraken, dev_kraken, test_kraken]),
)
for variant_suffix, variant_splits in splits_by_suffix:
    save_dataset(output_directory, variant_splits, names, suffix=variant_suffix)


  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]