# 200 - Datasets generation for nested-NER

Outputs:
Train, dev and test datasets for the multilayer (nested) NER experiments: experiment 1 (reference dataset) and experiment 2 (Pero OCR dataset).

<b>Experiment 1 : Groundtruth dataset</b>
* `m2-experiment_1_prepared_dataset_ref_io_camembert_ner`
* `m2-experiment_1_prepared_dataset_ref_io_pretrained_camembert_ner`
* `m2-experiment_1_prepared_dataset_ref_iob2_camembert_ner`
* `m2-experiment_1_prepared_dataset_ref_iob2_pretrained_camembert_ner`

<b>Experiment 2 : Pero OCR dataset</b>
* `m2-experiment_2_prepared_dataset_pero_ocr_io_camembert_ner`
* `m2-experiment_2_prepared_dataset_pero_ocr_io_pretrained_camembert_ner`
* `m2-experiment_2_prepared_dataset_pero_ocr_iob2_camembert_ner`
* `m2-experiment_2_prepared_dataset_pero_ocr_iob2_pretrained_camembert_ner`

## Initialization

In [None]:
import os, sys
from pathlib import Path

# Detect Google Colab from the IPython shell description and mirror the flag
# in the process environment (stored as the string "True" or "False").
ENV_IS_GOOGLE_COLAB = 'google.colab' in str(get_ipython())
os.environ["ENV_IS_GOOGLE_COLAB"] = str(ENV_IS_GOOGLE_COLAB)

if not ENV_IS_GOOGLE_COLAB:
    # Local run: paths are resolved against the current working directory.
    BASE = Path().resolve()  # Directory of this approach
    # Adapt the two paths below to your situation.
    DATASETS = Path('../dataset_ICDAR').resolve()  # Where your data are located before Dataset object creation
    OUT_BASE = Path('../res_ICDAR/method_2').resolve()  # Where the results of this notebook are saved
else:
    # Colab run: everything lives under the mounted Google Drive.
    from google.colab import drive
    mountpoint = Path("/content/drive")
    drive.mount(str(mountpoint))  # Mount gdrive
    base = mountpoint / "MyDrive/article_icdar_2023"  # Adapt this to your situation
    sys.path.append(str(base))  # Make the project directory importable
    BASE = Path(base).resolve()  # Make BASE absolute
    DATASETS = BASE / "dataset_ICDAR"
    OUT_BASE = BASE / "res_ICDAR/method_2"

# Echo the resulting configuration, one value per line.
for location in (sys.path, BASE, DATASETS, OUT_BASE):
    print(location)

## Constants

In [None]:
# GLOBAL CONSTANTS
import config
import os
from pathlib import Path

# Model the datasets are prepared for; the name is interpolated into the
# output directory names. Documented choices:
#   "camembert_ner"
#   "pretrained_camembert_ner"
MODEL_NAME = "pretrained_camembert_ner"

# Random seed used in train/dev/test. Do not change it if you want to
# recreate the paper results.
config.SPLIT_SEED = 42

# If true, text versions of the spacy datasets will be saved along with the
# .spacy files.
config.DEBUG = False

In [None]:
# Display the `name_or_path` of the `_convert_tokenizer` object exposed by
# camembert_utils/util_IO.py. Nothing is written to disk in this cell.
# NOTE(review): presumably the displayed name should correspond to MODEL_NAME — confirm.
from camembert_utils.util_IO import _convert_tokenizer
print("Tokenizer called in util_IO.py")
_convert_tokenizer.name_or_path

In [None]:
# Display the `name_or_path` of the `_convert_tokenizer` object exposed by
# camembert_utils/util_IOB2.py. Nothing is written to disk in this cell.
# This import rebinds the notebook-level name `_convert_tokenizer`.
# NOTE(review): presumably the displayed name should correspond to MODEL_NAME — confirm.
from camembert_utils.util_IOB2 import _convert_tokenizer
print("Tokenizer called in util_IOB2.py")
_convert_tokenizer.name_or_path

# 01. Experiment #1 : Reference dataset with joint-labels

In [None]:
import numpy as np
import pandas as pd
import csv
GOLD_REF = DATASETS / "41-ner_ref_from_pero/gold.csv"
assert GOLD_REF.exists()

# The file holds one record per line, formatted as:  "<ner_xml>", "<book>"
# It is parsed by hand on the literal separator `", "`.
# NOTE(review): an earlier version used
#   pd.read_csv(GOLD_REF, header=None, names=["ner_xml","book"], skipinitialspace=True)
# presumably it mis-parsed this file — confirm before switching back.
res = []
with open(GOLD_REF, 'r', encoding='utf8') as f:
    for line in f:
        # Strip the line terminator explicitly instead of assuming it is
        # there: a last line without a trailing newline would otherwise lose
        # the final character of its book name.
        line = line.rstrip('\n')
        if not line:
            # Blank lines carry no record (they used to raise IndexError).
            continue
        l = line.split('", "')
        # Drop the opening quote of the first field and the closing quote of
        # the second one.
        res.append([l[0][1:], l[1][:-1]])
gold_reference = pd.DataFrame(res, columns=["ner_xml", "book"])

In [None]:
# TITRE-H and TITRE-P labels to transformers NER Pipeline
def _normalize_titre_labels(ner_xml):
    """Return *ner_xml* with hyphenated TITRE tag names rewritten.

    When the string contains an opening ``<TITRE-H>`` tag, every occurrence
    of ``TITRE-H`` becomes ``TITREH``; likewise ``<TITRE-P>`` triggers the
    ``TITRE-P`` -> ``TITREP`` rewrite. Other strings are returned unchanged.
    """
    if '<TITRE-H>' in ner_xml:
        ner_xml = ner_xml.replace('TITRE-H', 'TITREH')
    if '<TITRE-P>' in ner_xml:
        ner_xml = ner_xml.replace('TITRE-P', 'TITREP')
    return ner_xml


# Assign the whole column at once. The previous element-wise form
# `gold_reference['ner_xml'][i] = ...` is chained assignment: it triggers
# SettingWithCopyWarning and does not modify the DataFrame at all when pandas
# Copy-on-Write is enabled.
gold_reference['ner_xml'] = gold_reference['ner_xml'].map(_normalize_titre_labels)
gold_reference

In [None]:
from dataset_util import train_dev_test_split, unwrap # Local imports
from sklearn.model_selection import train_test_split

# CONSTANTS
# Halving stops once the discarded half of a split has fewer examples than this.
MIN_TRAINSET_SIZE = 30

# Split 72/8/20% w. stratified sampling on directories names
train, dev, test = train_dev_test_split(gold_reference.to_numpy())

# Build a series of training sets, each one a stratified half of the previous
# one, starting from the full training split.
exp1_trainsets = [train]
t_len = len(train)

while True:
    current = exp1_trainsets[-1]
    try:
        _, groups = unwrap(current)
        smaller, rest = train_test_split(
            current,
            train_size=0.5,
            shuffle=True,
            random_state=config.SPLIT_SEED,
            stratify=groups,
        )
    except ValueError:
        # Stop now if we encounter the error "The least populated class in y has only 1 member".
        break

    t_len = len(rest)
    if t_len < MIN_TRAINSET_SIZE:
        # The candidate half is discarded: it is not added to the series.
        break
    exp1_trainsets.append(smaller)

[len(s) for s in exp1_trainsets] # Should be [6084, 3042, 1521, 760, 380, 190, 95, 47]

In [None]:
# Sanity checks: the split sizes are fixed, so any deviation means the input
# data or the split procedure changed.

# Expected sizes of the successive training sets (order does not matter).
expected_train_sizes = [6084, 3042, 1521, 760, 380, 190, 95, 47]

# Dev set should contain 676 examples
assert len(dev) == 676

# Test set should contain 1685 examples
assert len(test) == 1685

# Lengths of exp1_trainsets should be fixed
assert sorted(len(s) for s in exp1_trainsets) == sorted(expected_train_sizes)

In [None]:
from camembert_utils.tools import createStatsTab

# Pass the full train/dev/test splits to the project helper createStatsTab.
# NOTE(review): presumably this displays per-split entity statistics — confirm in camembert_utils/tools.py.
createStatsTab(train,dev,test)

## IO Labels

In [None]:
from camembert_utils.util_IO import save_dataset_io # Local import

# Save on disk
output_directory = OUT_BASE / f"m2-experiment_1_prepared_dataset_ref_io_{MODEL_NAME}"
output_directory.mkdir(exist_ok=True, parents=True) # Create if necessary
print(output_directory)

# One export per training-set size; dev and test are the same in every export
# and the training-set size is passed as the suffix.
# The loop variable is deliberately not named `train`, so the full training
# split bound to `train` is not overwritten by the last (smallest) subset.
for trainset in exp1_trainsets:
    datasets = [trainset, dev, test]
    save_dataset_io(output_directory, datasets, ["train","dev","test"], suffix=len(trainset))

## IOB2 Labels

In [None]:
from camembert_utils.util_IOB2 import save_dataset_iob2 # Local import

# Save on disk
output_directory = OUT_BASE / f"m2-experiment_1_prepared_dataset_ref_iob2_{MODEL_NAME}"
output_directory.mkdir(exist_ok=True, parents=True) # Create if necessary
print(output_directory)

# One export per training-set size; dev and test are the same in every export
# and the training-set size is passed as the suffix.
# The loop variable is deliberately not named `train`, so the full training
# split bound to `train` is not overwritten by the last (smallest) subset.
for trainset in exp1_trainsets:
    datasets = [trainset, dev, test]
    save_dataset_iob2(output_directory, datasets, ["train","dev","test"], suffix=len(trainset))

# 02. Experiment #2 : Pero OCR dataset

In [None]:
import numpy as np
import pandas as pd
import csv

GOLD_REF = DATASETS / "31-ner_align_pero/gold.csv"
# Headerless two-column CSV: the annotated entry (ner_xml) and its book.
# `skipinitialspace` is a boolean flag; it was previously given the string
# 'True', which only worked because any non-empty string is truthy.
gold_reference = pd.read_csv(GOLD_REF, header=None, names=["ner_xml","book"], skipinitialspace=True)

In [None]:
# TITRE-H and TITRE-P labels to transformers NER Pipeline
def _normalize_titre_labels(ner_xml):
    """Return *ner_xml* with hyphenated TITRE tag names rewritten.

    When the string contains an opening ``<TITRE-H>`` tag, every occurrence
    of ``TITRE-H`` becomes ``TITREH``; likewise ``<TITRE-P>`` triggers the
    ``TITRE-P`` -> ``TITREP`` rewrite. Other strings are returned unchanged.
    """
    if '<TITRE-H>' in ner_xml:
        ner_xml = ner_xml.replace('TITRE-H', 'TITREH')
    if '<TITRE-P>' in ner_xml:
        ner_xml = ner_xml.replace('TITRE-P', 'TITREP')
    return ner_xml


# Assign the whole column at once. The previous element-wise form
# `gold_reference['ner_xml'][i] = ...` is chained assignment: it triggers
# SettingWithCopyWarning and does not modify the DataFrame at all when pandas
# Copy-on-Write is enabled.
gold_reference['ner_xml'] = gold_reference['ner_xml'].map(_normalize_titre_labels)
gold_reference

In [None]:
from dataset_util import train_dev_test_split, unwrap # Local imports
from sklearn.model_selection import train_test_split

# CONSTANTS
# Halving stops once the discarded half of a split has fewer examples than this.
MIN_TRAINSET_SIZE = 30

# Split 72/8/20% w. stratified sampling on directories names
train, dev, test = train_dev_test_split(gold_reference.to_numpy())

# Build a series of training sets, each one a stratified half of the previous
# one, starting from the full training split.
# The list keeps the name `exp1_trainsets` because the following cells of this
# experiment read it under that name.
exp1_trainsets = [train]
t_len = len(train)

while True:
    current = exp1_trainsets[-1]
    try:
        _, groups = unwrap(current)
        smaller, rest = train_test_split(
            current,
            train_size=0.5,
            shuffle=True,
            random_state=config.SPLIT_SEED,
            stratify=groups,
        )
    except ValueError:
        # Stop now if we encounter the error "The least populated class in y has only 1 member".
        break

    t_len = len(rest)
    if t_len < MIN_TRAINSET_SIZE:
        # The candidate half is discarded: it is not added to the series.
        break
    exp1_trainsets.append(smaller)

[len(s) for s in exp1_trainsets] # Should be [6084, 3042, 1521, 760, 380, 190, 95, 47]

In [None]:
# Sanity checks: the split sizes are fixed, so any deviation means the input
# data or the split procedure changed.

# Expected sizes of the successive training sets (order does not matter).
expected_train_sizes = [6084, 3042, 1521, 760, 380, 190, 95, 47]

# Dev set should contain 676 examples
assert len(dev) == 676

# Test set should contain 1685 examples
assert len(test) == 1685

# Lengths of exp1_trainsets should be fixed
assert sorted(len(s) for s in exp1_trainsets) == sorted(expected_train_sizes)

### Save test dataset

In [None]:
# Save test subset in csv format for qualitative analysis.
# Each record is written on its own line as:  "<first field>", "<second field>"
with open("./pero_ocr_dataset_test_subset.csv",'w',encoding='utf8') as tfile:
    tfile.writelines(
        '"' + row[0] + '", "' + row[1] + '"\n'
        for row in test
    )

### Entities count

In [None]:
from camembert_utils.tools import createStatsTab

# Pass the full train/dev/test splits to the project helper createStatsTab.
# NOTE(review): presumably this displays per-split entity counts (see the section title) — confirm in camembert_utils/tools.py.
createStatsTab(train,dev,test)

### IO Labels

In [None]:
from camembert_utils.util_IO import save_dataset_io # Local import

# Save on disk
output_directory = OUT_BASE / f"m2-experiment_2_prepared_dataset_pero_ocr_io_{MODEL_NAME}"
output_directory.mkdir(exist_ok=True, parents=True) # Create if necessary
print(output_directory)

# One export per training-set size; dev and test are the same in every export
# and the training-set size is passed as the suffix.
# The loop variable is deliberately not named `train`, so the full training
# split bound to `train` is not overwritten by the last (smallest) subset.
for trainset in exp1_trainsets:
    datasets = [trainset, dev, test]
    save_dataset_io(output_directory, datasets, ["train","dev","test"], suffix=len(trainset))

### IOB2 Labels

In [None]:
from camembert_utils.util_IOB2 import save_dataset_iob2 # Local import

# Save on disk
output_directory = OUT_BASE / f"m2-experiment_2_prepared_dataset_pero_ocr_iob2_{MODEL_NAME}"
output_directory.mkdir(exist_ok=True, parents=True) # Create if necessary
print(output_directory)

# One export per training-set size; dev and test are the same in every export
# and the training-set size is passed as the suffix.
# The loop variable is deliberately not named `train`, so the full training
# split bound to `train` is not overwritten by the last (smallest) subset.
for trainset in exp1_trainsets:
    datasets = [trainset, dev, test]
    save_dataset_iob2(output_directory, datasets, ["train","dev","test"], suffix=len(trainset))