# 100 - Independent NER layers - Datasets generation

Outputs:
Train, dev & test datasets for the multilayer NER experiment 1 (with the reference dataset) and experiment 2 (with the Pero OCR dataset)

<b>Experiment 1 : Groundtruth dataset</b>
* `m1-experiment_1_prepared_dataset_ref_io_camembert_ner`
* `m1-experiment_1_prepared_dataset_ref_io_pretrained_camembert_ner`
* `m1-experiment_1_prepared_dataset_ref_iob2_camembert_ner`
* `m1-experiment_1_prepared_dataset_ref_iob2_pretrained_camembert_ner`

<b>Experiment 2 : Pero OCR dataset</b>
* `m1-experiment_2_prepared_dataset_pero_ocr_io_camembert_ner`
* `m1-experiment_2_prepared_dataset_pero_ocr_io_pretrained_camembert_ner`
* `m1-experiment_2_prepared_dataset_pero_ocr_iob2_camembert_ner`
* `m1-experiment_2_prepared_dataset_pero_ocr_iob2_pretrained_camembert_ner`

## Initialization

In [None]:
import os, sys
from pathlib import Path

# Detect whether the notebook runs on Google Colab and record the answer in the
# ENV_IS_GOOGLE_COLAB environment variable (stored as the string "True"/"False").
ENV_IS_GOOGLE_COLAB = 'google.colab' in str(get_ipython())
os.environ["ENV_IS_GOOGLE_COLAB"] = str(ENV_IS_GOOGLE_COLAB)

if not ENV_IS_GOOGLE_COLAB:
    # Local run: BASE is the current working directory (directory of this approach).
    BASE = Path().resolve()
    # Adapt the two paths below to your situation.
    DATASETS = Path('../dataset_ICDAR').resolve()  # Where your data are located before Dataset object creation
    OUT_BASE = Path('../res_ICDAR/method_1').resolve()  # Where the results of this notebook are saved
else:
    from google.colab import drive
    mountpoint = Path("/content/drive")
    drive.mount(str(mountpoint))  # Mount Google Drive
    base = mountpoint / "MyDrive/article_icdar_2023"  # Adapt this to your situation
    sys.path.append(str(base))  # Make the project folder importable
    BASE = Path(base).resolve()  # Make BASE absolute
    DATASETS = BASE / "dataset_ICDAR"
    OUT_BASE = BASE / "res_ICDAR/method_1"

# Print the resulting configuration, one value per line.
for shown in (sys.path, BASE, DATASETS, OUT_BASE):
    print(shown)

## Constants

**The tokenizer loaded in *util_io.py* and *util_iob2.py* has to be the same as the tokenizer of the model to fine-tune.<br>
Please check the *_convert_tokenizer* variable in those files and adapt the MODEL_NAME variable in this notebook before creating the datasets.**

In [None]:
# Show which tokenizer util_io.py loads, so it can be compared with the model to fine-tune
from multihead_utils.util_io import _convert_tokenizer
print("Tokenizer called in util_io.py")
_convert_tokenizer.name_or_path  # Last expression of the cell: its value is displayed as the cell output

In [None]:
# Show which tokenizer util_iob2.py loads, so it can be compared with the model to fine-tune
from multihead_utils.util_iob2 import _convert_tokenizer
print("Tokenizer called in util_iob2.py")
_convert_tokenizer.name_or_path  # Last expression of the cell: its value is displayed as the cell output

In [None]:
# GLOBAL CONSTANTS
import config
import os
from pathlib import Path

# Random seed used for the train/dev/test splits (it is passed as random_state
# when the trainsets are halved). Do not change it if you want to recreate the paper results.
config.SPLIT_SEED = 42
# If True, text versions of the spacy datasets will be saved along with the .spacy files
# (per the original note; NOTE(review): the code consuming this flag is not in this notebook).
config.DEBUG = False

In [None]:
# Tag appended to the output folder names (no spaces); it is NOT the real model name.
# Values used by the output folders listed at the top of this notebook:
#   "camembert_ner" or "pretrained_camembert_ner"
MODEL_NAME = "pretrained_camembert_ner"

## 101. Experiment #1 : Reference dataset

In [None]:
import numpy as np
import pandas as pd
import csv

GOLD_REF = DATASETS / "41-ner_ref_from_pero/gold.csv"
assert GOLD_REF.exists()

# The file is parsed by hand. Every record is expected to look like
#   "<ner_xml>", "<book>"
# i.e. two double-quoted fields separated by `", "`.
res = []
with open(GOLD_REF, 'r', encoding='utf8') as f:
    for line in f:
        record = line.rstrip('\n')
        if not record.strip():
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # failing with an IndexError on the missing second field.
            continue
        l = record.split('", "')
        # Drop the opening quote of the first field and the closing quote of
        # the second one. The newline is removed beforehand, so a last line
        # without a trailing newline no longer loses a real character.
        res.append([l[0][1:], l[1][:-1]])
gold_reference = pd.DataFrame(res, columns=["ner_xml", "book"])

In [None]:
# Rename the TITRE-H / TITRE-P tags to the hyphen-free TITREH / TITREP.
# NOTE(review): presumably needed because the hyphenated names clash with the
# label handling of the transformers NER pipeline — confirm.
#
# As before, a row is rewritten only when it contains the opening tag
# (`<TITRE-H>` / `<TITRE-P>`); in such a row every occurrence of the tag name
# is replaced, which also covers the closing tag.
#
# The whole column is assigned in one go rather than through
# `gold_reference['ner_xml'][i] = ...`: that chained assignment may write to a
# temporary copy and is a no-op when pandas Copy-on-Write is enabled.
for hyphenated, flattened in (("TITRE-H", "TITREH"), ("TITRE-P", "TITREP")):
    ner_xml = gold_reference["ner_xml"]
    has_open_tag = ner_xml.str.contains(f"<{hyphenated}>", regex=False, na=False)
    gold_reference["ner_xml"] = ner_xml.mask(
        has_open_tag,
        ner_xml.str.replace(hyphenated, flattened, regex=False),
    )

gold_reference

In [None]:
from multihead_utils.multihead_dataset_util import train_dev_test_split, unwrap # Local imports
from sklearn.model_selection import train_test_split

# CONSTANTS
# The halving below stops as soon as the left-over half of a split is smaller than this.
MIN_TRAINSET_SIZE = 30

# Split 72/8/20% with stratified sampling on directory names
train, dev, test = train_dev_test_split(gold_reference.to_numpy())
print(f"Dev : {len(dev)}")
print(f"Test : {len(test)}")

# Build a list of ever smaller trainsets: each one is a stratified half of the previous one.
exp1_trainsets = [train]
t_len = len(train)

while True:
    try:
        current = exp1_trainsets[-1]
        _, groups = unwrap(current)
        smaller, rest = train_test_split(
            current,
            stratify=groups,
            train_size=0.5,
            shuffle=True,
            random_state=config.SPLIT_SEED,
        )
    except ValueError:
        # Stop now if we encounter the error "The least populated class in y has only 1 member".
        break

    t_len = len(rest)
    if t_len < MIN_TRAINSET_SIZE:
        break
    exp1_trainsets.append(smaller)

# Sizes of the generated trainsets; the sanity-check cell expects
# 6084, 3042, 1521, 760, 380, 190, 95 and 47.
list(map(len, exp1_trainsets))

In [None]:
# Sanity checks on the sizes of the splits

EXPECTED_DEV_SIZE = 676
EXPECTED_TEST_SIZE = 1685
EXPECTED_TRAINSET_SIZES = [6084, 3042, 1521, 760, 380, 190, 95, 47]

assert len(dev) == EXPECTED_DEV_SIZE

assert len(test) == EXPECTED_TEST_SIZE

# Both sides are sorted before the comparison, so only the sizes matter, not their order
assert sorted(map(len, exp1_trainsets)) == sorted(EXPECTED_TRAINSET_SIZES)

In [None]:
from multihead_utils.tools import createStatsTab  # Local import

# Called on the three splits; as the last expression of the cell, its return value is displayed.
# NOTE(review): presumably a per-split statistics table — confirm in multihead_utils/tools.py
createStatsTab(train,dev,test)

### IO Labels

In [None]:
# Save on disk (IO labels, experiment 1)
from multihead_utils.util_io import save_dataset # Local import

output_directory = OUT_BASE / f"m1-experiment_1_prepared_dataset_ref_io_{MODEL_NAME}"
output_directory.mkdir(exist_ok=True, parents=True) # Create if necessary

# One dataset is saved per trainset; dev and test are shared by all of them and
# the trainset size is passed as suffix.
# The loop variable is deliberately not named `train`: reusing that name would
# overwrite the full trainset with the smallest subset once the loop is over.
for trainset in exp1_trainsets:
    datasets = [trainset, dev, test]
    save_dataset(output_directory, datasets, ["train","dev","test"], suffix=len(trainset))
output_directory

### IOB2 Labels

In [None]:
# Save on disk (IOB2 labels, experiment 1)
from multihead_utils.util_iob2 import save_dataset # Local import

output_directory = OUT_BASE / f"m1-experiment_1_prepared_dataset_ref_iob2_{MODEL_NAME}"
output_directory.mkdir(exist_ok=True, parents=True) # Create if necessary

# One dataset is saved per trainset; dev and test are shared by all of them and
# the trainset size is passed as suffix.
# The loop variable is deliberately not named `train`: reusing that name would
# overwrite the full trainset with the smallest subset once the loop is over.
for trainset in exp1_trainsets:
    datasets = [trainset, dev, test]
    save_dataset(output_directory, datasets, ["train","dev","test"], suffix=len(trainset))
output_directory

## 102. Experiment 2 : Pero OCR dataset

In [None]:
import numpy as np
import pandas as pd
import csv

GOLD_REF = DATASETS / "31-ner_align_pero/gold.csv"

# The file has no header row, so the two column names are supplied here.
# skipinitialspace must be a real boolean: the former string 'True' only worked
# because any non-empty string is truthy ('False' would have enabled it too).
gold_reference = pd.read_csv(GOLD_REF, header=None, names=["ner_xml","book"], skipinitialspace=True)

In [None]:
# Rename the TITRE-H / TITRE-P tags to the hyphen-free TITREH / TITREP.
# NOTE(review): presumably needed because the hyphenated names clash with the
# label handling of the transformers NER pipeline — confirm.
#
# As before, a row is rewritten only when it contains the opening tag
# (`<TITRE-H>` / `<TITRE-P>`); in such a row every occurrence of the tag name
# is replaced, which also covers the closing tag.
#
# The whole column is assigned in one go rather than through
# `gold_reference['ner_xml'][i] = ...`: that chained assignment may write to a
# temporary copy and is a no-op when pandas Copy-on-Write is enabled.
for hyphenated, flattened in (("TITRE-H", "TITREH"), ("TITRE-P", "TITREP")):
    ner_xml = gold_reference["ner_xml"]
    has_open_tag = ner_xml.str.contains(f"<{hyphenated}>", regex=False, na=False)
    gold_reference["ner_xml"] = ner_xml.mask(
        has_open_tag,
        ner_xml.str.replace(hyphenated, flattened, regex=False),
    )
gold_reference

In [None]:
from multihead_utils.multihead_dataset_util import train_dev_test_split, unwrap # Local imports
from sklearn.model_selection import train_test_split

# CONSTANTS
# The halving below stops as soon as the left-over half of a split is smaller than this.
MIN_TRAINSET_SIZE = 30

# Split 72/8/20% with stratified sampling on directory names
train, dev, test = train_dev_test_split(gold_reference.to_numpy())
print(f"Dev : {len(dev)}")
print(f"Test : {len(test)}")

# Build a list of ever smaller trainsets: each one is a stratified half of the previous one.
# (The list keeps the name `exp1_trainsets` although it now holds the experiment 2 data.)
exp1_trainsets = [train]
t_len = len(train)

while True:
    try:
        current = exp1_trainsets[-1]
        _, groups = unwrap(current)
        smaller, rest = train_test_split(
            current,
            stratify=groups,
            train_size=0.5,
            shuffle=True,
            random_state=config.SPLIT_SEED,
        )
    except ValueError:
        # Stop now if we encounter the error "The least populated class in y has only 1 member".
        break

    t_len = len(rest)
    if t_len < MIN_TRAINSET_SIZE:
        break
    exp1_trainsets.append(smaller)

# Sizes of the generated trainsets; the sanity-check cell expects
# 6084, 3042, 1521, 760, 380, 190, 95 and 47.
list(map(len, exp1_trainsets))

In [None]:
# Sanity checks on the sizes of the splits

EXPECTED_DEV_SIZE = 676
EXPECTED_TEST_SIZE = 1685
EXPECTED_TRAINSET_SIZES = [6084, 3042, 1521, 760, 380, 190, 95, 47]

assert len(dev) == EXPECTED_DEV_SIZE

assert len(test) == EXPECTED_TEST_SIZE

# Both sides are sorted before the comparison, so only the sizes matter, not their order
assert sorted(map(len, exp1_trainsets)) == sorted(EXPECTED_TRAINSET_SIZES)

In [None]:
from multihead_utils.tools import createStatsTab  # Local import

# Called on the three splits; as the last expression of the cell, its return value is displayed.
# NOTE(review): presumably a per-split statistics table — confirm in multihead_utils/tools.py
createStatsTab(train,dev,test)

### IO Labels

In [None]:
# Save on disk (IO labels, experiment 2)
from multihead_utils.util_io import save_dataset # Local import

output_directory = OUT_BASE / f"m1-experiment_2_prepared_dataset_pero_ocr_io_{MODEL_NAME}"
output_directory.mkdir(exist_ok=True, parents=True) # Create if necessary

# One dataset is saved per trainset; dev and test are shared by all of them and
# the trainset size is passed as suffix.
# The loop variable is deliberately not named `train`: reusing that name would
# overwrite the full trainset with the smallest subset once the loop is over.
for trainset in exp1_trainsets:
    datasets = [trainset, dev, test]
    save_dataset(output_directory, datasets, ["train","dev","test"], suffix=len(trainset))
output_directory

In [None]:
#Example
import pandas as pd
from datasets import load_from_disk

# Reload the dataset saved with suffix 6084 (the largest trainset size) from
# the current output directory and print one training example.
dataset_root = "huggingface_6084"
datasetdir = output_directory / dataset_root
train_dev_test = load_from_disk(datasetdir)

# Tokens of example #3, followed by its two levels of NER tags
sample = train_dev_test["train"][3]
for column in ("tokens", "ner_tags_niv1", "ner_tags_niv2"):
    print(sample[column])

### IOB2 Labels

In [None]:
# Save on disk (IOB2 labels, experiment 2)
from multihead_utils.util_iob2 import save_dataset # Local import

output_directory = OUT_BASE / f"m1-experiment_2_prepared_dataset_pero_ocr_iob2_{MODEL_NAME}"
output_directory.mkdir(exist_ok=True, parents=True) # Create if necessary

# One dataset is saved per trainset; dev and test are shared by all of them and
# the trainset size is passed as suffix.
# The loop variable is deliberately not named `train`: reusing that name would
# overwrite the full trainset with the smallest subset once the loop is over.
for trainset in exp1_trainsets:
    datasets = [trainset, dev, test]
    save_dataset(output_directory, datasets, ["train","dev","test"], suffix=len(trainset))
output_directory