# 00 - Datasets generation

Prepares all train, dev and test sets for the spaCy and transformers models for both NER experiments (sections 5 and 6).
This notebook must be executed before any of the other NER notebooks.

The sets will be saved to disk:
- In `01-experiment_1_prepared_datasets`: train, dev & test datasets for each size of training sets for experiment 1
- In `02-experiment_2_prepared_datasets`: train, dev & test datasets for clean & noisy OCR data (Pero-OCR, Tesseract) for experiment 2 

## Initialisation
The initialisation step:
- sets the random seed SPLIT_SEED to use in all training set generation to ensure repeatable results
- creates a logger named `nerlogger`
- defines the paths to the directories used by the NER notebooks
- imports all the modules used in this notebook

In [1]:
""" Loads the configuration """

# Set DEBUG to 1/true to switch the logging level of nerlogger to DEBUG
# and to save the spaCy datasets as TXT alongside the .spacy files,
# which makes the training set generation easier to debug.
%env DEBUG=1

# If True, activates a set of assertions in the notebooks to ensure
# that the scripts run with the parameters used in the paper.
%env AS_IN_THE_PAPER = True

# NOTE(review): this import sits after the %env lines -- presumably util.config
# reads DEBUG and AS_IN_THE_PAPER when it is imported; confirm before moving it.
import util.config as config

# Log the active configuration (directories, debug mode, random seed,
# reproducibility checks) so that it is recorded in the notebook output.
config.show()


18/05/2022 02:39:22 ; INFO ; BASEDIR: /home/bertrand/paper-ner-bench-das22
18/05/2022 02:39:22 ; INFO ; Input datasets will be loaded from DATASETDIR /home/bertrand/paper-ner-bench-das22/dataset
18/05/2022 02:39:22 ; INFO ; Training data and models will be saved to NERDIR /home/bertrand/paper-ner-bench-das22/src/ner
18/05/2022 02:39:22 ; INFO ; Debug mode is ON
18/05/2022 02:39:22 ; INFO ; Random seed: 42
18/05/2022 02:39:22 ; INFO ; Enable reproducibility checks: True


env: DEBUG=1
env: AS_IN_THE_PAPER=True


In [2]:
""" Import all modules at once """

# General imports
import numpy as np
import pandas as pd
import csv

# NER imports
from util.dataset_util import train_dev_test_split, unwrap, save_dataset
from sklearn.model_selection import train_test_split
from util.as_in_the_paper import assert_expected


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/bertrand/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 01. Experiment #1

Generates the training, development and test sets for the Spacy and transformers models as described in the subsection #5 "Training and evaluation protocol" of the paper.

In [13]:
""" Loads the input dataset from DATASETDIR. """

# Gold reference used by experiment 1: two columns, the NER-annotated entry
# ("ner_xml") and the directory it comes from ("book").
GOLD_REFERENCE_PATH = config.DATASETDIR / "supervised/10-ref-ocr-ner-json/gold.csv"

assert GOLD_REFERENCE_PATH.exists(), f"Gold reference not found: {GOLD_REFERENCE_PATH}"

# The file is read without a header row (header=None), hence the explicit column names.
gold_reference = pd.read_csv(
    GOLD_REFERENCE_PATH,
    header=None,
    names=["ner_xml", "book"],
    # Must be a real boolean: a string such as 'True' (or 'False'!) is always truthy.
    skipinitialspace=True,
)

# "AS IN PAPER" check: number of entries in the gold reference.
assert_expected(len(gold_reference), 8772)

gold_reference

Unnamed: 0,ner_xml,book
0,"<PER>Dufan et Clémendot</PER>, <ACT>pharmacien...",Bottin1_1820
1,"<PER>Dufant (Victor)</PER>, <ACT>libraire</ACT...",Bottin1_1820
2,"<PER>Dufay</PER>, <ACT>essayeur du commerce</A...",Bottin1_1820
3,"<PER>Dulay</PER>, <ACT>chandronnier</ACT>, <LO...",Bottin1_1820
4,"<PER>Dufay (V.e)</PER>, <ACT>grenetière</ACT>,...",Bottin1_1820
...,...,...
8767,"<PER>Lamarche</PER>, <ACT>géographe</ACT> , <L...",Notables_communaux_seine_1801
8768,"<PER>Lamarck</PER>, <ACT>membre de l&apos;inst...",Notables_communaux_seine_1801
8769,"<PER>Lamare</PER>, <ACT>notaire</ACT>, <LOC>ru...",Notables_communaux_seine_1801
8770,"<PER>Lamarre</PER> , <ACT>carrier</ACT>, <LOC>...",Notables_communaux_seine_1801


In [6]:
""" Actually generates the sets. """

# Do not create training sets smaller than this.
# You can adjust this value to your convenience but the training processes might
# complain.
# Do not change it if you want to reproduce the results from the article.
MIN_TRAINSET_SIZE = 30

# Split 72/8/20% w. stratified sampling on directories names
train, dev, test = train_dev_test_split(gold_reference.to_numpy())

# Iteratively split the trainset in half to create smaller trainsets.
# exp1_trainsets[0] is the full training set; each following entry is a
# stratified half of the previous one.
exp1_trainsets = [train]

while True:
    current = exp1_trainsets[-1]
    try:
        _, groups = unwrap(current)
        smaller, rest = train_test_split(
            current,
            train_size=0.5,
            shuffle=True,
            random_state=config.SPLIT_SEED,
            stratify=groups,
        )
    except ValueError:
        # Stop now if we encounter the error "The least populated class in y has only 1 member".
        break

    # The size limit applies to `smaller`, the half that is actually kept.
    # `rest` can hold one more sample when `current` has an odd length, so
    # testing it instead could let a set below the minimum slip through.
    if len(smaller) < MIN_TRAINSET_SIZE:
        break
    exp1_trainsets.append(smaller)


trainset_sizes = [len(s) for s in exp1_trainsets]

config.logger.debug(f"Experiment #1 trainsets sizes: {trainset_sizes}")
config.logger.debug(f"Experiment #1 dev set size: {len(dev)}")
config.logger.debug(f"Experiment #1 test set size: {len(test)}")

# "AS IN PAPER" checks. Apply only if config.AS_IN_THE_PAPER is true
# - number of samples in the full trainset
assert_expected(6373, len(train))

# - number of samples in the subsets of the trainset set
paper_trainset_sizes = "[6373, 3186, 1593, 796, 398, 199, 99, 49]"
assert_expected(paper_trainset_sizes, str(trainset_sizes))

# - number of samples in the dev set
assert_expected(709, len(dev))

# - number of samples in the test set
assert_expected(1690, len(test))

18/05/2022 02:40:48 ; DEBUG ; Experiment #1 trainsets sizes: [6373, 3186, 1593, 796, 398, 199, 99, 49]
18/05/2022 02:40:48 ; DEBUG ; Experiment #1 dev set size: 709
18/05/2022 02:40:48 ; DEBUG ; Experiment #1 test set size: 1690


In [7]:
""" Save the generated sets on the disk. """

output_directory = config.NERDIR / "01-experiment_1_prepared_datasets"

# Create the output directory if necessary
output_directory.mkdir(exist_ok=True, parents=True)

# Names given to the three parts of every saved dataset (same for all sizes).
names = ["train", "dev", "test"]

# Save one train/dev/test dataset per training set size; dev and test are
# shared by all of them and the size of the training set is used as suffix.
# The loop variable is deliberately not called `train`, so that the full
# training set defined by the previous cell is not overwritten by this loop.
for trainset in exp1_trainsets:
    datasets = [trainset, dev, test]
    config.logger.info(f"Saving dataset with training set of size {len(trainset)} to {output_directory}")
    save_dataset(output_directory, datasets, names, suffix=len(trainset))

18/05/2022 02:40:56 ; INFO ; Saving dataset with training set of size 6373 to /home/bertrand/paper-ner-bench-das22/src/ner/01-experiment_1_prepared_datasets
100%|██████████| 7/7 [00:00<00:00, 25.82ba/s]
100%|██████████| 1/1 [00:00<00:00, 30.17ba/s]
100%|██████████| 2/2 [00:00<00:00, 27.81ba/s]
18/05/2022 02:41:04 ; INFO ; Saving dataset with training set of size 3186 to /home/bertrand/paper-ner-bench-das22/src/ner/01-experiment_1_prepared_datasets
100%|██████████| 4/4 [00:00<00:00, 28.88ba/s]
100%|██████████| 1/1 [00:00<00:00, 31.75ba/s]
100%|██████████| 2/2 [00:00<00:00, 26.41ba/s]
18/05/2022 02:41:08 ; INFO ; Saving dataset with training set of size 1593 to /home/bertrand/paper-ner-bench-das22/src/ner/01-experiment_1_prepared_datasets
100%|██████████| 2/2 [00:00<00:00, 29.69ba/s]
100%|██████████| 1/1 [00:00<00:00, 33.01ba/s]
100%|██████████| 2/2 [00:00<00:00, 26.94ba/s]
18/05/2022 02:41:11 ; INFO ; Saving dataset with training set of size 796 to /home/bertrand/paper-ner-bench-das22/s

# 02. Experiment #2

Generates the training, development and test sets for the Spacy and transformers models as described in the subsection #6 "Training and evaluation protocol" of the paper.

In [14]:
""" Loads the gold referernce dataset from DATASETDIR. """

# Gold reference used by experiment 2. The cells below read its columns
# "ner_xml_ref", "ner_xml_pero", "ner_xml_tess" and "book".
# NOTE: this rebinds GOLD_REFERENCE_PATH and gold_reference, which held the
# experiment 1 data until now.
GOLD_REFERENCE_PATH = config.DATASETDIR / "supervised/40-ner_aligned_valid_subset/gold.csv"
assert GOLD_REFERENCE_PATH.exists(), f"Gold reference not found: {GOLD_REFERENCE_PATH}"

# skipinitialspace must be a real boolean: a string such as 'True' (or 'False'!)
# is always truthy.
gold_reference = pd.read_csv(GOLD_REFERENCE_PATH, skipinitialspace=True)

# "AS IN PAPER" check: number of entries in the gold reference.
assert_expected(len(gold_reference), 8341)

gold_reference

Unnamed: 0,ner_xml_ref,ner_xml_pero,ner_xml_tess,book
0,"<PER>Dufant (Victor)</PER>, <ACT>libraire</ACT...","☞ T <PER>Dufant (Victor)</PER>, <ACT>libraire...","<PER>Dofaut (Victor)</PER>, <ACT>Sbraire</ACT>...",Bottin1_1820
1,"<PER>Dufay</PER>, <ACT>essayeur du commerce</A...","<PER>Dutay</PER>, <ACT>essayeur du commerce</A...","<PER>Dufay</PER>, <ACT>essayeur du commerce</A...",Bottin1_1820
2,"<PER>Dulay</PER>, <ACT>chandronnier</ACT>, <LO...","<PER>Dulay</PER>, <ACT>chandronnier</ACT>, <LO...","<PER>Dufay</PER>, <ACT>chandronnier</ACT>, <LO...",Bottin1_1820
3,"<PER>Dufay (V.e)</PER>, <ACT>grenetière</ACT>,...","<PER>Dufay (V.e)</PER>, <ACT>grenetière</ACT>,...","&quot;<PER>Dufay (V.e)</PER>, <ACT>grenetière<...",Bottin1_1820
4,"<PER>Dufeu</PER>, <ACT>charcutier</ACT>, <LOC>...","Y ☞ <PER>Dnten</PER>,<ACT>charentier</ACT>, <L...","<PER>Dufen</PER> . <ACT>chareutier</ACT>, <LOC...",Bottin1_1820
...,...,...,...,...
8336,"<PER>Lamarche</PER>, <ACT>géographe</ACT> , <L...","<PER>Lamarche</PER>, <ACT>geographe</ACT> , <L...","<PER>Lamarche</PER>, <ACT>geographe</ACT> , <L...",Notables_communaux_seine_1801
8337,"<PER>Lamarck</PER>, <ACT>membre de l&apos;inst...","<PER>Lamarck</PER>, <ACT>membre de l&apos;inst...","<PER>Lamarck</PER>, <ACT>membre de l&apos;inst...",Notables_communaux_seine_1801
8338,"<PER>Lamare</PER>, <ACT>notaire</ACT>, <LOC>ru...","<PER>Lamare</PER> , <ACT>notaire</ACT>, <LOC>r...","<PER>Lamare</PER> , <ACT>notaire</ACT>, <LOC>r...",Notables_communaux_seine_1801
8339,"<PER>Lamarre</PER> , <ACT>carrier</ACT>, <LOC>...","<PER>Lamarre</PER>, <ACT>carrier</ACT>, <LOC>r...","<PER>Lamarre</PER> , <ACT>Carrier</ACT>, <LOC>...",Notables_communaux_seine_1801


In [16]:
""" Creates the training sets. """

def _assert_paper_split_sizes(train_part, dev_part, test_part):
    """Compare the sizes of a train/dev/test triple with the experiment 2 sizes of the paper."""
    assert_expected(6004, len(train_part))
    assert_expected(668, len(dev_part))
    assert_expected(1669, len(test_part))


# Every text source below goes through the same steps: keep its text column
# together with "book", split 72/8/20% w. stratified sampling on directories
# names, then check the sizes of the three parts.

# Reference gold (manually annotated & corrected examples)
ref = gold_reference[["ner_xml_ref", "book"]]
train_ref, dev_ref, test_ref = train_dev_test_split(ref.to_numpy())
_assert_paper_split_sizes(train_ref, dev_ref, test_ref)

# Pero-OCR gold
pero = gold_reference[["ner_xml_pero", "book"]]
train_pero, dev_pero, test_pero = train_dev_test_split(pero.to_numpy())
_assert_paper_split_sizes(train_pero, dev_pero, test_pero)

# Tesseract gold
tess = gold_reference[["ner_xml_tess", "book"]]
train_tess, dev_tess, test_tess = train_dev_test_split(tess.to_numpy())
_assert_paper_split_sizes(train_tess, dev_tess, test_tess)

In [19]:
""" Save the generated sets on the disk. """

# Destination directory of the experiment 2 datasets, created on demand.
output_directory = config.NERDIR / "02-experiment_2_prepared_datasets"
output_directory.mkdir(parents=True, exist_ok=True)

# Names given to the three parts of every saved dataset.
names = ["train", "dev", "test"]

# One train/dev/test triple per text source, keyed by the suffix passed to
# save_dataset. They are saved in insertion order: ref, pero, then tess.
splits_by_suffix = {
    "ref": [train_ref, dev_ref, test_ref],
    "pero": [train_pero, dev_pero, test_pero],
    "tess": [train_tess, dev_tess, test_tess],
}

for suffix, splits in splits_by_suffix.items():
    save_dataset(output_directory, splits, names, suffix=suffix)


100%|██████████| 7/7 [00:00<00:00, 27.23ba/s]
100%|██████████| 1/1 [00:00<00:00, 34.59ba/s]
100%|██████████| 2/2 [00:00<00:00, 27.89ba/s]
100%|██████████| 7/7 [00:00<00:00, 28.32ba/s]
100%|██████████| 1/1 [00:00<00:00, 35.79ba/s]
100%|██████████| 2/2 [00:00<00:00, 29.23ba/s]
100%|██████████| 7/7 [00:00<00:00, 27.35ba/s]
100%|██████████| 1/1 [00:00<00:00, 32.77ba/s]
100%|██████████| 2/2 [00:00<00:00, 27.27ba/s]
