# 00 - Flat NER datasets generation

Execute this notebook twice: once with **MODEL_NAME** set to *camembert_ner* and once with **MODEL_NAME** set to *pretrained_camembert_ner*.

**Outputs**
- In `01-experiment_1_prepared_ref_dataset_camembert_ner` : train, dev, test for clean data with CamemBERT NER tokenizer
- In `01-experiment_1_prepared_ref_dataset_pretrained_camembert_ner` : train, dev, test for clean data with Pretrained CamemBERT NER tokenizer
- In `02-experiment_2_prepared_pero_ocr_dataset_camembert_ner` : train, dev & test datasets for noisy OCR data (Pero-OCR) with CamemBERT NER tokenizer
- In `02-experiment_2_prepared_pero_ocr_dataset_pretrained_camembert_ner` : train, dev & test datasets for noisy OCR data (Pero-OCR) with Pretrained CamemBERT NER tokenizer

In [1]:
import os

# Expose only GPU #1 to CUDA-based libraries loaded later in this kernel,
# then echo the value so the choice is visible in the cell output.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})
print(os.environ["CUDA_VISIBLE_DEVICES"])

1


## Initialization

In [2]:
import os, sys
from pathlib import Path

# Detect Google Colab: the repr of the running IPython shell mentions 'google.colab' there.
ENV_IS_GOOGLE_COLAB = 'google.colab' in str(get_ipython())
# Export the flag as a string ("True"/"False") through the process environment.
os.environ["ENV_IS_GOOGLE_COLAB"] = str(ENV_IS_GOOGLE_COLAB)

if ENV_IS_GOOGLE_COLAB:
  from google.colab import drive
  mountpoint = Path("/content/drive")
  drive.mount(str(mountpoint)) # Mount gdrive to BASE
  base = mountpoint / "MyDrive/article_icdar_2023" # Adapt this to your situation
  sys.path.append(str(base)) # Add BASE to Python Path
  BASE = Path(base).resolve() # Make BASE absolute
  DATASETS =  BASE / "dataset"
  # OUT_BASE must exist on Colab too: it is printed below and used by the "Save on disk" cells.
  RES_PATH_BASE = str(BASE / "res_ICDAR") # ADAPT THIS TO YOUR SITUATION : folder where you save your results
  OUT_BASE = (Path(RES_PATH_BASE) / "method_0").resolve()
else:
  BASE = Path(os.path.dirname(os.path.realpath("__file__"))).resolve() # If not on GColab, BASE will be the directory of this notebook
  DATASETS = Path('/work/stual/dataset_ICDAR').resolve() # ADAPT THIS TO YOUR SITUATION : Your dataset location
  RES_PATH_BASE = '/work/stual/res_ICDAR/' # ADAPT THIS TO YOUR SITUATION : folder where you save your results
  OUT_BASE = Path(RES_PATH_BASE + '/method_0').resolve()

# Echo the resolved locations so a wrong path is spotted before any data is written.
print(sys.path)
print(BASE)
print(DATASETS)
print(OUT_BASE)

['/lrde/home2/stual/code_ICDAR_review/src/m0_flat_ner', '/usr/lib/python310.zip', '/usr/lib/python3.10', '/usr/lib/python3.10/lib-dynload', '', '/lrde/home2/stual/.local/lib/python3.10/site-packages', '/usr/lib/python3.10/site-packages']
/lrde/home2/stual/code_ICDAR_review/src/m0_flat_ner
/work/stual/dataset_ICDAR
/work/stual/res_ICDAR/method_0


## Constants

In [3]:
# GLOBAL CONSTANTS
import config
import os
from pathlib import Path

# Seed stored on the shared `config` module; the halving loops below pass it as
# `random_state` to scikit-learn's train_test_split.
config.SPLIT_SEED = 42 # Random seed used in train/dev/test. Do not change it if you want to recreate the paper results.

# Name interpolated into the output directory names of both experiments.
# Run the whole notebook once per value.
MODEL_NAME = "pretrained_camembert_ner" #camembert_ner OR pretrained_camembert_ner

# 01. Experiment #1

In [4]:
import numpy as np
import pandas as pd
import csv
# Gold reference for Experiment #1.
GOLD_REF = DATASETS / "41-ner_ref_from_pero/gold.csv"
assert GOLD_REF.exists()

# The file is parsed by hand: each record is expected to look like
#   "<ner_xml>", "<book>"
# i.e. two double-quoted fields separated by `", "`.
res = []
with open(GOLD_REF,'r',encoding='utf8') as f:
    for line in f:
        # Strip the line terminator explicitly instead of slicing a fixed number of
        # characters: the last record may not end with a newline.
        line = line.rstrip('\n')
        if not line:
            continue  # Ignore blank lines (e.g. trailing empty line at end of file)
        l = line.split('", "')
        # Drop the opening quote of the first field and the closing quote of the second one.
        res.append([l[0][1:],l[1][:-1]])
gold_reference = pd.DataFrame(res,columns=["ner_xml","book"])

In [5]:
#TITRE-H and TITRE-P labels to transformers NER Pipeline
# Rename the hyphenated labels to TITREH / TITREP. As before, a row is only rewritten
# when it contains the opening tag, and then every occurrence of the label in that row
# (opening and closing tag) is replaced.
# The assignment goes through .loc on the DataFrame itself: chained indexing
# (gold_reference['ner_xml'][i] = ...) may write to a temporary copy.
for old_label, new_label in (('TITRE-H', 'TITREH'), ('TITRE-P', 'TITREP')):
    has_tag = gold_reference['ner_xml'].str.contains(f'<{old_label}>', regex=False, na=False)
    gold_reference.loc[has_tag, 'ner_xml'] = (
        gold_reference.loc[has_tag, 'ner_xml'].str.replace(old_label, new_label, regex=False)
    )
gold_reference

Unnamed: 0,ner_xml,book
0,"<PER>Dufant (Victor)</PER>, <ACT>libraire</ACT...",Bottin1_1820
1,"<PER>Dufay</PER>, <ACT>essayeur du commerce</A...",Bottin1_1820
2,"<PER>Dulay</PER>, <ACT>chandronnier</ACT>, <SP...",Bottin1_1820
3,"<PER>Dufay (V.e)</PER>, <ACT>grenetière</ACT>,...",Bottin1_1820
4,"<PER>Dufeu</PER>, <ACT>charcutier</ACT>, <SPAT...",Bottin1_1820
...,...,...
8440,"<PER>Lamarche</PER>, <ACT>géographe</ACT> , <S...",Notables_communaux_seine_1801
8441,"<PER>Lamarck</PER>, <ACT>membre de l'institut<...",Notables_communaux_seine_1801
8442,"<PER>Lamare</PER>, <ACT>notaire</ACT>, <SPAT><...",Notables_communaux_seine_1801
8443,"<PER>Lamarre</PER> , <ACT>carrier</ACT>, <SPAT...",Notables_communaux_seine_1801


In [6]:
from dataset_util import train_dev_test_split, unwrap # Local imports
from sklearn.model_selection import train_test_split

# CONSTANTS
# Halving stops as soon as the left-over half would hold fewer examples than this.
MIN_TRAINSET_SIZE = 30

# Split 72/8/20% w. stratified sampling on directories names
train, dev, test = train_dev_test_split(gold_reference.to_numpy())

# Build a series of nested training sets: start from the full one and keep
# halving the most recent set (stratified, seeded) until it becomes too small
# or scikit-learn refuses to stratify.
exp1_trainsets = [train]
t_len = len(train)

keep_halving = True
while keep_halving:
    current = exp1_trainsets[-1]
    try:
        _, groups = unwrap(current)
        smaller, rest = train_test_split(
            current,
            train_size=0.5,
            shuffle=True,
            random_state=config.SPLIT_SEED,
            stratify=groups,
        )
    except ValueError:
        # Stop now if we encounter the error "The least populated class in y has only 1 member".
        keep_halving = False
    else:
        t_len = len(rest)
        if t_len >= MIN_TRAINSET_SIZE:
            exp1_trainsets.append(smaller)
        else:
            keep_halving = False

# Expected sizes: [6084, 3042, 1521, 760, 380, 190, 95, 47]
[len(trainset) for trainset in exp1_trainsets]

2023-01-19 14:34:07.256907: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-19 14:34:08.964264: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-01-19 14:34:08.964375: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
[nltk_data] Downloading package punkt to
[nltk_data]     /lrde/home2/stual/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[6084, 3042, 1521, 760, 380, 190, 95, 47]

In [7]:
# Sanity checks: the split sizes are fixed by the seed, so any deviation means
# the input data or the split procedure has changed.

# The dev split must hold exactly 676 examples
assert 676 == len(dev)

# The test split must hold exactly 1685 examples
assert 1685 == len(test)

# The sizes of the nested training sets are fixed as well (compared regardless of order)
assert sorted(map(len, exp1_trainsets)) == sorted([6084, 3042, 1521, 760, 380, 190, 95, 47])

In [8]:
from tools import createStatsTab

# Summary statistics for the three splits of this experiment. Judging from the rendered
# output, the table has one row per entity type and Train/Dev/Test/All columns
# (see tools.createStatsTab for the exact counting rules).
# Kept as the last expression of the cell so the returned table is displayed.
createStatsTab(train,dev,test)

Unnamed: 0_level_0,Train,Dev,Test,All
Entity type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PER,6085,676,1685,8446
ACT,4568,519,1094,6181
TITRE,290,42,76,408
LOC,6871,762,1788,9421
FT,55,7,14,76
CARDINAL,5991,678,1751,8420
All,23860,2684,6408,32952


In [8]:
# Save on disk
from dataset_util import save_dataset # Local import

# One output folder per tokenizer/model flavour.
output_directory = OUT_BASE / f"01-experiment_1_prepared_ref_dataset_{MODEL_NAME}"
output_directory.mkdir(exist_ok=True, parents=True) # Create if necessary

# Save one (train, dev, test) triplet per training-set size; dev and test are the same
# for every triplet. The size of the training set is passed as `suffix`
# (presumably used by save_dataset to tell the triplets apart - see dataset_util).
# The loop variable is deliberately not called `train`: rebinding that notebook-level
# name would leave it pointing at the smallest subset and silently corrupt any cell
# re-run afterwards (e.g. the statistics table).
for trainset in exp1_trainsets:
    datasets = [trainset, dev, test]
    save_dataset(output_directory, datasets, ["train","dev","test"], suffix=len(trainset))

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

Saving the dataset (0/1 shards):   0%|          | 0/6084 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/676 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1685 [00:00<?, ? examples/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

Saving the dataset (0/1 shards):   0%|          | 0/3042 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/676 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1685 [00:00<?, ? examples/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

Saving the dataset (0/1 shards):   0%|          | 0/1521 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/676 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1685 [00:00<?, ? examples/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

Saving the dataset (0/1 shards):   0%|          | 0/760 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/676 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1685 [00:00<?, ? examples/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

Saving the dataset (0/1 shards):   0%|          | 0/380 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/676 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1685 [00:00<?, ? examples/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

Saving the dataset (0/1 shards):   0%|          | 0/190 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/676 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1685 [00:00<?, ? examples/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

Saving the dataset (0/1 shards):   0%|          | 0/95 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/676 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1685 [00:00<?, ? examples/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

Saving the dataset (0/1 shards):   0%|          | 0/47 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/676 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1685 [00:00<?, ? examples/s]

# 02. Experiment #2

In [9]:
import numpy as np
import pandas as pd
import csv

# Gold reference for Experiment #2 (noisy Pero-OCR text).
GOLD_REF = DATASETS / "31-ner_align_pero/gold.csv"
# Same early check as in Experiment #1, so a wrong DATASETS path fails with a clear message.
assert GOLD_REF.exists(), f"Gold reference not found: {GOLD_REF}"

# The file has no header row: name the two columns explicitly.
# skipinitialspace must be a real boolean; the string 'True' only worked because
# any non-empty string is truthy ('False' would have enabled the option as well).
gold_reference = pd.read_csv(GOLD_REF, header=None, names=["ner_xml","book"], skipinitialspace=True)

In [10]:
#TITRE-H and TITRE-P labels to transformers NER Pipeline
# Rename the hyphenated labels to TITREH / TITREP. As before, a row is only rewritten
# when it contains the opening tag, and then every occurrence of the label in that row
# (opening and closing tag) is replaced.
# The assignment goes through .loc on the DataFrame itself: chained indexing
# (gold_reference['ner_xml'][i] = ...) may write to a temporary copy.
# na=False: rows read as missing values by read_csv are left untouched instead of raising.
for old_label, new_label in (('TITRE-H', 'TITREH'), ('TITRE-P', 'TITREP')):
    has_tag = gold_reference['ner_xml'].str.contains(f'<{old_label}>', regex=False, na=False)
    gold_reference.loc[has_tag, 'ner_xml'] = (
        gold_reference.loc[has_tag, 'ner_xml'].str.replace(old_label, new_label, regex=False)
    )
gold_reference

Unnamed: 0,ner_xml,book
0,"☞ T <PER>Dufant (Victor)</PER>, <ACT>libraire...",Bottin1_1820
1,"<PER>Dutay</PER>, <ACT>essayeur du commerce</A...",Bottin1_1820
2,"<PER>Dulay</PER>, <ACT>chandronnier</ACT>, <SP...",Bottin1_1820
3,"<PER>Dufay (V.e)</PER>, <ACT>grenetière</ACT>,...",Bottin1_1820
4,"Y ☞ <PER>Dnten</PER>,<ACT>charentier</ACT>, <S...",Bottin1_1820
...,...,...
8440,"<PER>Lamarche</PER>, <ACT>geographe</ACT> , <S...",Notables_communaux_seine_1801
8441,"<PER>Lamarck</PER>, <ACT>membre de l&apos;inst...",Notables_communaux_seine_1801
8442,"<PER>Lamare</PER> , <ACT>notaire</ACT>, <SPAT>...",Notables_communaux_seine_1801
8443,"<PER>Lamarre</PER>, <ACT>carrier</ACT>, <SPAT>...",Notables_communaux_seine_1801


In [11]:
from dataset_util import train_dev_test_split, unwrap # Local imports
from sklearn.model_selection import train_test_split

# CONSTANTS
# Halving stops as soon as the left-over half would hold fewer examples than this.
MIN_TRAINSET_SIZE = 30

# Split 72/8/20% w. stratified sampling on directories names
train, dev, test = train_dev_test_split(gold_reference.to_numpy())

# Build a series of nested training sets: start from the full one and keep
# halving the most recent set (stratified, seeded) until it becomes too small
# or scikit-learn refuses to stratify.
# NOTE: the list keeps the name `exp1_trainsets` although this is Experiment #2,
# because the following cells refer to it under that name.
exp1_trainsets = [train]
t_len = len(train)

keep_halving = True
while keep_halving:
    current = exp1_trainsets[-1]
    try:
        _, groups = unwrap(current)
        smaller, rest = train_test_split(
            current,
            train_size=0.5,
            shuffle=True,
            random_state=config.SPLIT_SEED,
            stratify=groups,
        )
    except ValueError:
        # Stop now if we encounter the error "The least populated class in y has only 1 member".
        keep_halving = False
    else:
        t_len = len(rest)
        if t_len >= MIN_TRAINSET_SIZE:
            exp1_trainsets.append(smaller)
        else:
            keep_halving = False

# Expected sizes: [6084, 3042, 1521, 760, 380, 190, 95, 47]
[len(trainset) for trainset in exp1_trainsets]

[6084, 3042, 1521, 760, 380, 190, 95, 47]

In [12]:
# Sanity checks: the split sizes are fixed by the seed, so any deviation means
# the input data or the split procedure has changed.

# The dev split must hold exactly 676 examples
assert 676 == len(dev)

# The test split must hold exactly 1685 examples
assert 1685 == len(test)

# The sizes of the nested training sets are fixed as well (compared regardless of order)
assert sorted(map(len, exp1_trainsets)) == sorted([6084, 3042, 1521, 760, 380, 190, 95, 47])

In [13]:
from tools import createStatsTab

# Summary statistics for the three splits of this experiment. Judging from the rendered
# output, the table has one row per entity type and Train/Dev/Test/All columns
# (see tools.createStatsTab for the exact counting rules).
# Kept as the last expression of the cell so the returned table is displayed.
createStatsTab(train,dev,test)

Unnamed: 0_level_0,Train,Dev,Test,All
Entity type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PER,6085,676,1685,8446
ACT,4568,519,1094,6181
TITRE,290,42,76,408
LOC,6871,762,1788,9421
FT,55,7,14,76
CARDINAL,5991,678,1751,8420
All,23860,2684,6408,32952


In [14]:
# Save on disk
from dataset_util import save_dataset # Local import

# One output folder per tokenizer/model flavour.
output_directory = OUT_BASE / f"02-experiment_2_prepared_pero_ocr_dataset_{MODEL_NAME}"
output_directory.mkdir(exist_ok=True, parents=True) # Create if necessary

# Save one (train, dev, test) triplet per training-set size; dev and test are the same
# for every triplet. The size of the training set is passed as `suffix`
# (presumably used by save_dataset to tell the triplets apart - see dataset_util).
# The loop variable is deliberately not called `train`: rebinding that notebook-level
# name would leave it pointing at the smallest subset and silently corrupt any cell
# re-run afterwards (e.g. the statistics table).
for trainset in exp1_trainsets:
    datasets = [trainset, dev, test]
    save_dataset(output_directory, datasets, ["train","dev","test"], suffix=len(trainset))

KeyboardInterrupt: 