# 120 - M1 - Independent-Layered NER Model - Experiment #2

Experiment 2 fine-tunes the CamemBERT-NER and pretrained CamemBERT-NER models on Pero OCR annotated entries, using IO and IOB2 labels.
* `121-camembert-multihead-io` : Fine-tuned CamemBERT-NER with IO labels
* `122-camembert-multihead-iob2`: Fine-tuned CamemBERT-NER with IOB2 labels
* `123-pretrained-camembert-multihead-io`: Fine-tuned Pretrained CamemBERT-NER with IO labels
* `124-pretrained-camembert-multihead-iob2`: Fine-tuned Pretrained CamemBERT-NER with IOB2 labels

In [None]:
#!pip install --upgrade transformers datasets spacy transformers[sentencepiece] seqeval

## Initialisation
Set the BASE path.
If run on Google Colab, this will also mount Google Drive at the mountpoint given below.

In [None]:
import os, sys
from pathlib import Path

# Detect the execution environment from the IPython shell description and
# expose the result through an environment variable as well.
ENV_IS_GOOGLE_COLAB = 'google.colab' in str(get_ipython())
os.environ["ENV_IS_GOOGLE_COLAB"] = str(ENV_IS_GOOGLE_COLAB)

if not ENV_IS_GOOGLE_COLAB:
    # Local execution: everything is resolved relative to the working directory.
    BASE = Path().resolve()  # Directory of this approach
    # Adapt this to your situation
    DATASETS = Path('../dataset_ICDAR').resolve()  # Where the data are located before Dataset object creation
    OUT_BASE = Path('../res_ICDAR/method_1').resolve()  # Where the results of this notebook are saved
else:
    # Google Colab: the project lives on Google Drive, which must be mounted first.
    from google.colab import drive
    mountpoint = Path("/content/drive")
    drive.mount(str(mountpoint))  # Mount gdrive
    base = mountpoint / "MyDrive/article_icdar_2023"  # Adapt this to your situation
    sys.path.append(str(base))  # Make the project importable
    BASE = base.resolve()  # Make BASE absolute
    DATASETS = BASE / "dataset_ICDAR"
    OUT_BASE = BASE / "res_ICDAR/method_1"

# Show the resulting configuration, one value per line.
print(sys.path, BASE, DATASETS, OUT_BASE, sep="\n")

## Constants

In [None]:
# Switches selecting which experiments are executed: each flag guards the
# "Run the main loop" cell of the corresponding section (121 to 124) below.
RUN_CAMEMBERT_IO = False
RUN_CAMEMBERT_IOB2 = False
#Can't run together because of convert_tokenizer_
# NOTE(review): presumably the pretrained variants below must not be enabled
# in the same session as the two flags above - confirm.
RUN_PTRN_CAMEMBERT_IO = False
RUN_PTRN_CAMEMBERT_IOB2 = True

# Number of times a model will be trained & evaluated on each dataset
N_RUNS = 5

# Number of entity depth levels (train_bert() builds and trains one model per level)
NUMBER_OF_LEVELS = 2

**Don't forget to check the tokenizer name in the *model_util_io.py* and *model_util_iob2.py* files (it must be the same as the model name)!**

## Parameters

In [None]:
# Training options shared by every experiment of this notebook. train_bert()
# copies this dict, adds an "output_dir" entry and hands it to init_model().
# NOTE(review): the keys look like transformers.TrainingArguments fields -
# confirm in multihead_utils.
TRAINING_CONFIG = {
    # Evaluation schedule
    "evaluation_strategy": "steps",
    "eval_steps": 100,

    # Optimisation
    "max_steps": 5000,
    "learning_rate": 1e-4,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "weight_decay": 1e-5,

    # Best-model selection
    "load_best_model_at_end": True,
    "greater_is_better": True,
    "metric_for_best_model": "f1",

    # Checkpointing
    "save_strategy": "steps",
    "save_steps": 100,
    "save_total_limit": 1,
}

In [None]:
import pandas as pd
from datasets import load_from_disk

#Print examples from datasets
def loadExample(INPUT_DIR, set_length: int, i: int, subset: str) -> None:
    """Print the tokens and the two levels of NER tags of one dataset example.

    Args:
        INPUT_DIR: Path of the directory holding the ``huggingface_<size>`` datasets.
        set_length: Trainset size; selects the ``huggingface_<set_length>`` dataset.
        i: Index of the example inside the chosen split.
        subset: Name of the split to read (the notebook uses ``"train"``).
    """
    dataset = load_from_disk(INPUT_DIR / f"huggingface_{set_length}")
    # Fetch the row once instead of indexing the split again for every column.
    example = dataset[subset][i]
    df = pd.DataFrame.from_dict({
        "tokens": example["tokens"],
        "niv_1_tags": example["ner_tags_niv1"],
        "niv_2_tags": example["ner_tags_niv2"],
    })
    print(df)

## 121 - Train & eval : IO Pero OCR dataset with CamemBERT model

In [None]:
# Short model tag; part of the prepared-dataset directory name (see INPUT_DIR)
MODEL_NAME = "camembert_ner"
# Model identifier passed to init_model() by train_bert()
MODEL = "Jean-Baptiste/camembert-ner"
# Label scheme tag; part of the prepared-dataset directory name (see INPUT_DIR)
LABEL = "io"

### 121.1 Load dataset

In [None]:
import os
from pathlib import Path
from config import logger

# Expected datasets indexed by number of examples in the trainset
#TRAINSETS_SIZES = [47,95,190,380,760,1521,3042,6084] #To train on all 8 dataset sizes
TRAINSETS_SIZES = [6084] #To train only on the biggest dataset

# INPUT / OUTPUT DIRS
# INPUT_DIR holds the prepared datasets for the current LABEL / MODEL_NAME
# (one "huggingface_<size>" folder per trainset size is read from it).
INPUT_DIR = OUT_BASE / f"m1-experiment_2_prepared_dataset_pero_ocr_{LABEL}_{MODEL_NAME}"
# Root folder under which each experiment creates its own metrics directory.
METRICS_OUTPUT_DIR = OUT_BASE / "m1-120-experiment_2_metrics"
# Last expression of the cell: the notebook displays both paths.
INPUT_DIR, METRICS_OUTPUT_DIR

In [None]:
# Preview example 22 of the "train" split, taken from the last listed trainset size
loadExample(INPUT_DIR,TRAINSETS_SIZES[-1],22,"train")

### 121.2 Fine-tuning with IO labels - train & eval

In [None]:
from config import logger
from datasets import load_from_disk
import numpy as np
from multihead_utils.util_io import train_eval_loop, init_model
import json
    
def train_bert(metrics_output_directory: Path,
               model_output_directory: Path,
               local_config: dict,
               max_levels_number: int) -> None:
    """Fine-tune and evaluate one model per run, entity level and trainset size.

    For every run in 1..N_RUNS, every level in 1..max_levels_number and every
    size in TRAINSETS_SIZES, the prepared dataset is loaded from INPUT_DIR, the
    columns of the current level are exposed as ``labels`` / ``ner_tags``, a
    model is built with init_model() and handed to train_eval_loop(). The
    returned metrics are written as JSON to
    ``<metrics_output_directory>/run_<run>/level-<level>/{test,dev}_<size>.json``.

    Relies on the notebook globals N_RUNS, TRAINSETS_SIZES, INPUT_DIR, MODEL and
    logger, and on the init_model / train_eval_loop functions imported in this cell.

    Args:
        metrics_output_directory: Directory under which ``run_<run>`` folders are created.
        model_output_directory: Directory whose ``level-<level>`` child is used as
            the ``output_dir`` entry of the training configuration.
        local_config: Training options; a shallow copy is made, the argument is
            never modified.
        max_levels_number: Number of entity levels to process (levels start at 1).
    """
    # torch is not imported anywhere else in the notebook; import it up front so
    # that a missing name fails before any training instead of after the first one.
    import torch

    for run in range(1, N_RUNS + 1):
        output_dir = metrics_output_directory / f"run_{run}"
        output_dir.mkdir(parents=True, exist_ok=True)

        for i in range(1, max_levels_number + 1):

            for trainset_size in TRAINSETS_SIZES:
                datasetdir = INPUT_DIR / f"huggingface_{trainset_size}"
                logger.info(f"Running on datasets in {datasetdir}")
                logger.info(f"Metrics will be saved in {output_dir}")

                # Load the data and expose the columns of the current level
                # under the generic names "labels" / "ner_tags".
                train_dev_test = load_from_disk(datasetdir)
                train_dev_test_ = train_dev_test.rename_column(f"labels_niv{i}", "labels")
                train_dev_test_ = train_dev_test_.rename_column(f"ner_tags_niv{i}", "ner_tags")

                # Per-level copy of the configuration, so the caller's dict stays untouched.
                conf = local_config.copy()
                conf["output_dir"] = model_output_directory / f"level-{i}"

                # Get the model components
                model, tokenizer, training_args = init_model(MODEL, conf, run)
                logger.info(f"{model} #{run}, will save in {output_dir}")

                # Metrics of this level go to their own sub-directory
                output_dir_metrics = output_dir / f"level-{i}"
                output_dir_metrics.mkdir(parents=True, exist_ok=True)

                # The dataset splits are unpacked as keyword arguments (one per split name).
                metrics = train_eval_loop(model,
                                          training_args,
                                          tokenizer,
                                          **train_dev_test_)

                # Save the metrics: element 0 goes to the "test" file, element 1 to the "dev" file.
                for split, split_metrics in (("test", metrics[0]), ("dev", metrics[1])):
                    metrics_file = output_dir_metrics / f"{split}_{trainset_size}.json"
                    with open(metrics_file, "w", encoding="utf-8") as o:
                        json.dump(split_metrics, o)

                # The dataset is reloaded at every iteration, so the level columns
                # do not need to be renamed back here.
                torch.cuda.empty_cache()

In [None]:
# Experiment 121: runs only when its flag is enabled in the "Constants" cell.
if RUN_CAMEMBERT_IO:
    from multihead_utils.util_io import init_model
    import time
    import datetime
    import torch  # needed for empty_cache() below; not imported elsewhere in the notebook

    h = time.time()

    # CONSTS
    MODEL_METRICS_DIR = METRICS_OUTPUT_DIR / "121-camembert-ner-multihead-io"
    MODEL_METRICS_DIR.mkdir(exist_ok=True, parents=True)
    MODEL_OUTPUT_MODEL_PATH = OUT_BASE / "tmp/121-camembert-ner-multihead-io"

    # Run the main loop
    train_bert(MODEL_METRICS_DIR,
               MODEL_OUTPUT_MODEL_PATH,
               TRAINING_CONFIG,
               NUMBER_OF_LEVELS)

    # The level index only exists inside train_bert(), so the message reports
    # overall completion instead of referencing an undefined variable.
    print(f"{MODEL} fine-tuning with IO labels is over.")
    # Total wall-clock time divided by the number of runs, i.e. the average per run.
    runtime = (time.time() - h) / N_RUNS
    print(f"Run-time is equal to {str(datetime.timedelta(seconds=runtime))}")
    torch.cuda.empty_cache()
else:
    print(f"Skipped finetuning {MODEL} for IO labels")

## 122 - Train & eval : IOB2 Pero OCR dataset with CamemBERT model

In [None]:
# Short model tag; part of the prepared-dataset directory name (see INPUT_DIR)
MODEL_NAME = "camembert_ner"
# Model identifier passed to init_model() by train_bert()
MODEL = "Jean-Baptiste/camembert-ner"
# Label scheme tag; part of the prepared-dataset directory name (see INPUT_DIR)
LABEL = "iob2"

### 122.1 Load dataset

In [None]:
import os
from pathlib import Path
from config import logger

# Expected datasets indexed by number of examples in the trainset
#TRAINSETS_SIZES = [47,95,190,380,760,1521,3042,6084] #To train on all 8 dataset sizes
TRAINSETS_SIZES = [6084] #To train only on the biggest dataset

# INPUT / OUTPUT DIRS
# INPUT_DIR holds the prepared datasets for the current LABEL / MODEL_NAME
# (one "huggingface_<size>" folder per trainset size is read from it).
INPUT_DIR = OUT_BASE / f"m1-experiment_2_prepared_dataset_pero_ocr_{LABEL}_{MODEL_NAME}"
# Root folder under which each experiment creates its own metrics directory.
METRICS_OUTPUT_DIR = OUT_BASE / "m1-120-experiment_2_metrics"
# Last expression of the cell: the notebook displays both paths.
INPUT_DIR, METRICS_OUTPUT_DIR

In [None]:
# Preview example 22 of the "train" split, taken from the last listed trainset size
loadExample(INPUT_DIR,TRAINSETS_SIZES[-1],22,"train")

In [None]:
from config import logger
from datasets import load_from_disk
from multihead_utils.util_iob2 import train_eval_loop, init_model
import json
    
def train_bert(metrics_output_directory: Path,
               model_output_directory: Path,
               local_config: dict,
               max_levels_number: int) -> None:
    """Fine-tune and evaluate one model per run, entity level and trainset size.

    For every run in 1..N_RUNS, every level in 1..max_levels_number and every
    size in TRAINSETS_SIZES, the prepared dataset is loaded from INPUT_DIR, the
    columns of the current level are exposed as ``labels`` / ``ner_tags``, a
    model is built with init_model() and handed to train_eval_loop(). The
    returned metrics are written as JSON to
    ``<metrics_output_directory>/run_<run>/level-<level>/{test,dev}_<size>.json``.

    Relies on the notebook globals N_RUNS, TRAINSETS_SIZES, INPUT_DIR, MODEL and
    logger, and on the init_model / train_eval_loop functions imported in this cell.

    Args:
        metrics_output_directory: Directory under which ``run_<run>`` folders are created.
        model_output_directory: Directory whose ``level-<level>`` child is used as
            the ``output_dir`` entry of the training configuration.
        local_config: Training options; a shallow copy is made, the argument is
            never modified.
        max_levels_number: Number of entity levels to process (levels start at 1).
    """
    # torch is not imported anywhere else in the notebook; import it up front so
    # that a missing name fails before any training instead of after the first one.
    import torch

    for run in range(1, N_RUNS + 1):
        output_dir = metrics_output_directory / f"run_{run}"
        output_dir.mkdir(parents=True, exist_ok=True)

        for i in range(1, max_levels_number + 1):

            for trainset_size in TRAINSETS_SIZES:
                datasetdir = INPUT_DIR / f"huggingface_{trainset_size}"
                logger.info(f"Running on datasets in {datasetdir}")
                logger.info(f"Metrics will be saved in {output_dir}")

                # Load the data and expose the columns of the current level
                # under the generic names "labels" / "ner_tags".
                train_dev_test = load_from_disk(datasetdir)
                train_dev_test_ = train_dev_test.rename_column(f"labels_niv{i}", "labels")
                train_dev_test_ = train_dev_test_.rename_column(f"ner_tags_niv{i}", "ner_tags")

                # Per-level copy of the configuration, so the caller's dict stays untouched.
                conf = local_config.copy()
                conf["output_dir"] = model_output_directory / f"level-{i}"

                # Get the model components
                model, tokenizer, training_args = init_model(MODEL, conf, run)
                logger.info(f"{model} #{run}, will save in {output_dir}")

                # Metrics of this level go to their own sub-directory
                output_dir_metrics = output_dir / f"level-{i}"
                output_dir_metrics.mkdir(parents=True, exist_ok=True)

                # The dataset splits are unpacked as keyword arguments (one per split name).
                metrics = train_eval_loop(model,
                                          training_args,
                                          tokenizer,
                                          **train_dev_test_)

                # Save the metrics: element 0 goes to the "test" file, element 1 to the "dev" file.
                for split, split_metrics in (("test", metrics[0]), ("dev", metrics[1])):
                    metrics_file = output_dir_metrics / f"{split}_{trainset_size}.json"
                    with open(metrics_file, "w", encoding="utf-8") as o:
                        json.dump(split_metrics, o)

                # The dataset is reloaded at every iteration, so the level columns
                # do not need to be renamed back here.
                torch.cuda.empty_cache()

### 122.2 Fine-tuning with IOB2 labels - train & eval

In [None]:
# Experiment 122: runs only when its flag is enabled in the "Constants" cell.
if RUN_CAMEMBERT_IOB2:
    import time
    import datetime
    import torch  # needed for empty_cache() below; not imported elsewhere in the notebook

    h = time.time()

    # CONSTS
    MODEL_METRICS_DIR = METRICS_OUTPUT_DIR / "122-camembert-ner-multihead-iob2"
    MODEL_METRICS_DIR.mkdir(exist_ok=True, parents=True)
    MODEL_OUTPUT_MODEL_PATH = OUT_BASE / "tmp/122-camembert-ner-multihead-iob2"

    # Run the main loop
    train_bert(MODEL_METRICS_DIR,
               MODEL_OUTPUT_MODEL_PATH,
               TRAINING_CONFIG,
               NUMBER_OF_LEVELS)

    print(f"{MODEL} fine-tuning with IOB2 labels is over.")
    # Total wall-clock time divided by the number of runs, i.e. the average per run.
    runtime = (time.time() - h) / N_RUNS
    print(f"Run-time is equal to {str(datetime.timedelta(seconds=runtime))}")
    torch.cuda.empty_cache()
else:
    print(f"Skipped finetuning {MODEL} for IOB2 labels")

## 123 - Train & eval : IO Pero OCR dataset with Pretrained CamemBERT model

In [None]:
# Short model tag; part of the prepared-dataset directory name (see INPUT_DIR)
MODEL_NAME = "pretrained_camembert_ner"
# Model identifier passed to init_model() by train_bert()
MODEL = "HueyNemud/das22-10-camembert_pretrained"
# Label scheme tag; part of the prepared-dataset directory name (see INPUT_DIR)
LABEL = "io"

### 123.1 Load dataset

In [None]:
import os
from pathlib import Path
from config import logger

# Expected datasets indexed by number of examples in the trainset
#TRAINSETS_SIZES = [47,95,190,380,760,1521,3042,6084] #To train on all 8 dataset sizes
TRAINSETS_SIZES = [6084] #To train only on the biggest dataset

# INPUT / OUTPUT DIRS
# INPUT_DIR holds the prepared datasets for the current LABEL / MODEL_NAME
# (one "huggingface_<size>" folder per trainset size is read from it).
INPUT_DIR = OUT_BASE / f"m1-experiment_2_prepared_dataset_pero_ocr_{LABEL}_{MODEL_NAME}"
# Root folder under which each experiment creates its own metrics directory.
METRICS_OUTPUT_DIR = OUT_BASE / "m1-120-experiment_2_metrics"
# Last expression of the cell: the notebook displays both paths.
INPUT_DIR, METRICS_OUTPUT_DIR

In [None]:
# Preview example 22 of the "train" split, taken from the last listed trainset size
loadExample(INPUT_DIR,TRAINSETS_SIZES[-1],22,"train")

### 123.2 Fine-tuning with IO labels - train & eval

In [None]:
from config import logger
from datasets import load_from_disk
import numpy as np
from multihead_utils.util_io import train_eval_loop, init_model
import json
    
def train_bert(metrics_output_directory: Path,
               model_output_directory: Path,
               local_config: dict,
               max_levels_number: int) -> None:
    """Fine-tune and evaluate one model per run, entity level and trainset size.

    For every run in 1..N_RUNS, every level in 1..max_levels_number and every
    size in TRAINSETS_SIZES, the prepared dataset is loaded from INPUT_DIR, the
    columns of the current level are exposed as ``labels`` / ``ner_tags``, a
    model is built with init_model() and handed to train_eval_loop(). The
    returned metrics are written as JSON to
    ``<metrics_output_directory>/run_<run>/level-<level>/{test,dev}_<size>.json``.

    Relies on the notebook globals N_RUNS, TRAINSETS_SIZES, INPUT_DIR, MODEL and
    logger, and on the init_model / train_eval_loop functions imported in this cell.

    Args:
        metrics_output_directory: Directory under which ``run_<run>`` folders are created.
        model_output_directory: Directory whose ``level-<level>`` child is used as
            the ``output_dir`` entry of the training configuration.
        local_config: Training options; a shallow copy is made, the argument is
            never modified.
        max_levels_number: Number of entity levels to process (levels start at 1).
    """
    # torch is not imported anywhere else in the notebook; import it up front so
    # that a missing name fails before any training instead of after the first one.
    import torch

    for run in range(1, N_RUNS + 1):
        output_dir = metrics_output_directory / f"run_{run}"
        output_dir.mkdir(parents=True, exist_ok=True)

        for i in range(1, max_levels_number + 1):

            for trainset_size in TRAINSETS_SIZES:
                datasetdir = INPUT_DIR / f"huggingface_{trainset_size}"
                logger.info(f"Running on datasets in {datasetdir}")
                logger.info(f"Metrics will be saved in {output_dir}")

                # Load the data and expose the columns of the current level
                # under the generic names "labels" / "ner_tags".
                train_dev_test = load_from_disk(datasetdir)
                train_dev_test_ = train_dev_test.rename_column(f"labels_niv{i}", "labels")
                train_dev_test_ = train_dev_test_.rename_column(f"ner_tags_niv{i}", "ner_tags")

                # Per-level copy of the configuration, so the caller's dict stays untouched.
                conf = local_config.copy()
                conf["output_dir"] = model_output_directory / f"level-{i}"

                # Get the model components
                model, tokenizer, training_args = init_model(MODEL, conf, run)
                logger.info(f"{model} #{run}, will save in {output_dir}")

                # Metrics of this level go to their own sub-directory
                output_dir_metrics = output_dir / f"level-{i}"
                output_dir_metrics.mkdir(parents=True, exist_ok=True)

                # The dataset splits are unpacked as keyword arguments (one per split name).
                metrics = train_eval_loop(model,
                                          training_args,
                                          tokenizer,
                                          **train_dev_test_)

                # Save the metrics: element 0 goes to the "test" file, element 1 to the "dev" file.
                for split, split_metrics in (("test", metrics[0]), ("dev", metrics[1])):
                    metrics_file = output_dir_metrics / f"{split}_{trainset_size}.json"
                    with open(metrics_file, "w", encoding="utf-8") as o:
                        json.dump(split_metrics, o)

                # The dataset is reloaded at every iteration, so the level columns
                # do not need to be renamed back here.
                torch.cuda.empty_cache()

In [None]:
# Experiment 123: runs only when its flag is enabled in the "Constants" cell.
if RUN_PTRN_CAMEMBERT_IO:
    import time
    import datetime
    import torch  # needed for empty_cache() below; not imported elsewhere in the notebook

    h = time.time()

    # CONSTS
    MODEL_METRICS_DIR = METRICS_OUTPUT_DIR / "123-pretrained-camembert-ner-multihead-io"
    MODEL_METRICS_DIR.mkdir(exist_ok=True, parents=True)
    MODEL_OUTPUT_MODEL_PATH = OUT_BASE / "tmp/123-pretrained-camembert-ner-multihead-io"

    # Run the main loop
    train_bert(MODEL_METRICS_DIR,
               MODEL_OUTPUT_MODEL_PATH,
               TRAINING_CONFIG,
               NUMBER_OF_LEVELS)

    print("Model fine-tuning with IO labels is over.")
    # Total wall-clock time divided by the number of runs, i.e. the average per run.
    runtime = (time.time() - h) / N_RUNS
    print(f"Run-time is equal to {str(datetime.timedelta(seconds=runtime))}")
    torch.cuda.empty_cache()
else:
    print(f"Skipped finetuning {MODEL} for IO labels")

## 124 - Train & eval : IOB2 Pero OCR dataset with Pretrained CamemBERT model

In [None]:
# Short model tag; part of the prepared-dataset directory name (see INPUT_DIR)
MODEL_NAME = "pretrained_camembert_ner"
# Model identifier passed to init_model() by train_bert()
MODEL = "HueyNemud/das22-10-camembert_pretrained"
# Label scheme tag; part of the prepared-dataset directory name (see INPUT_DIR)
LABEL = "iob2"

### 124.1 Load dataset

In [None]:
import os
from pathlib import Path
from config import logger

# Expected datasets indexed by number of examples in the trainset
#TRAINSETS_SIZES = [47,95,190,380,760,1521,3042,6084] #To train on all 8 dataset sizes
TRAINSETS_SIZES = [6084] #To train only on the biggest dataset

# INPUT / OUTPUT DIRS
# INPUT_DIR holds the prepared datasets for the current LABEL / MODEL_NAME
# (one "huggingface_<size>" folder per trainset size is read from it).
INPUT_DIR = OUT_BASE / f"m1-experiment_2_prepared_dataset_pero_ocr_{LABEL}_{MODEL_NAME}"
# Root folder under which each experiment creates its own metrics directory.
METRICS_OUTPUT_DIR = OUT_BASE / "m1-120-experiment_2_metrics"
# Last expression of the cell: the notebook displays both paths.
INPUT_DIR, METRICS_OUTPUT_DIR

In [None]:
# Preview example 22 of the "train" split, taken from the last listed trainset size
loadExample(INPUT_DIR,TRAINSETS_SIZES[-1],22,"train")

In [None]:
from config import logger
from datasets import load_from_disk
from multihead_utils.util_iob2 import train_eval_loop, init_model
import json

def train_bert(metrics_output_directory: Path,
               model_output_directory: Path,
               local_config: dict,
               max_levels_number: int) -> None:
    """Fine-tune and evaluate one model per run, entity level and trainset size.

    For every run in 1..N_RUNS, every level in 1..max_levels_number and every
    size in TRAINSETS_SIZES, the prepared dataset is loaded from INPUT_DIR, the
    columns of the current level are exposed as ``labels`` / ``ner_tags``, a
    model is built with init_model() and handed to train_eval_loop(). The
    returned metrics are written as JSON to
    ``<metrics_output_directory>/run_<run>/level-<level>/{test,dev}_<size>.json``.

    Relies on the notebook globals N_RUNS, TRAINSETS_SIZES, INPUT_DIR, MODEL and
    logger, and on the init_model / train_eval_loop functions imported in this cell.

    Args:
        metrics_output_directory: Directory under which ``run_<run>`` folders are created.
        model_output_directory: Directory whose ``level-<level>`` child is used as
            the ``output_dir`` entry of the training configuration.
        local_config: Training options; a shallow copy is made, the argument is
            never modified.
        max_levels_number: Number of entity levels to process (levels start at 1).
    """
    # torch is not imported anywhere else in the notebook; import it up front so
    # that a missing name fails before any training instead of after the first one.
    import torch

    for run in range(1, N_RUNS + 1):
        output_dir = metrics_output_directory / f"run_{run}"
        output_dir.mkdir(parents=True, exist_ok=True)

        for i in range(1, max_levels_number + 1):

            for trainset_size in TRAINSETS_SIZES:
                datasetdir = INPUT_DIR / f"huggingface_{trainset_size}"
                logger.info(f"Running on datasets in {datasetdir}")
                logger.info(f"Metrics will be saved in {output_dir}")

                # Load the data and expose the columns of the current level
                # under the generic names "labels" / "ner_tags".
                train_dev_test = load_from_disk(datasetdir)
                train_dev_test_ = train_dev_test.rename_column(f"labels_niv{i}", "labels")
                train_dev_test_ = train_dev_test_.rename_column(f"ner_tags_niv{i}", "ner_tags")

                # Per-level copy of the configuration, so the caller's dict stays untouched.
                conf = local_config.copy()
                conf["output_dir"] = model_output_directory / f"level-{i}"

                # Get the model components
                model, tokenizer, training_args = init_model(MODEL, conf, run)
                logger.info(f"{model} #{run}, will save in {output_dir}")

                # Metrics of this level go to their own sub-directory
                output_dir_metrics = output_dir / f"level-{i}"
                output_dir_metrics.mkdir(parents=True, exist_ok=True)

                # The dataset splits are unpacked as keyword arguments (one per split name).
                metrics = train_eval_loop(model,
                                          training_args,
                                          tokenizer,
                                          **train_dev_test_)

                # Save the metrics: element 0 goes to the "test" file, element 1 to the "dev" file.
                for split, split_metrics in (("test", metrics[0]), ("dev", metrics[1])):
                    metrics_file = output_dir_metrics / f"{split}_{trainset_size}.json"
                    with open(metrics_file, "w", encoding="utf-8") as o:
                        json.dump(split_metrics, o)

                # The dataset is reloaded at every iteration, so the level columns
                # do not need to be renamed back here.
                torch.cuda.empty_cache()

### 124.2 Fine-tuning with IOB2 labels - train & eval

In [None]:
# Experiment 124: runs only when its flag is enabled in the "Constants" cell.
if RUN_PTRN_CAMEMBERT_IOB2:
    import time
    import datetime
    import torch  # needed for empty_cache() below; not imported elsewhere in the notebook

    h = time.time()

    # CONSTS
    MODEL_METRICS_DIR = METRICS_OUTPUT_DIR / "124-pretrained-camembert-multihead-iob2"
    MODEL_METRICS_DIR.mkdir(exist_ok=True, parents=True)
    MODEL_OUTPUT_MODEL_PATH = OUT_BASE / "tmp/124-pretrained-camembert-multihead-iob2"

    # Run the main loop
    train_bert(MODEL_METRICS_DIR,
               MODEL_OUTPUT_MODEL_PATH,
               TRAINING_CONFIG,
               NUMBER_OF_LEVELS)

    print(f"{MODEL} fine-tuning with IOB2 labels is over.")
    # Total wall-clock time divided by the number of runs, i.e. the average per run.
    runtime = (time.time() - h) / N_RUNS
    print(f"Run-time is equal to {str(datetime.timedelta(seconds=runtime))}")
    torch.cuda.empty_cache()
else:
    print(f"Skipped finetuning {MODEL} for IOB2 labels")