# 10 - Flat NER - Experiment #1

Fine-tune models with ground-truth datasets.

**Outputs :**
* `11-flat-ner-ref-camembert_ner`
* `12-flat-ner-ref-pretrained_camembert_ner`

In [None]:
#!pip install --upgrade transformers datasets spacy transformers[sentencepiece] seqeval

## Initialisation
Set the BASE path.
If run on Google Colab, this will also mount Google Drive at the mountpoint given below.

In [None]:
import os, sys
from pathlib import Path

# Detect whether the notebook runs inside Google Colab and publish the answer
# through the process environment as well.
ENV_IS_GOOGLE_COLAB = 'google.colab' in str(get_ipython())
os.environ["ENV_IS_GOOGLE_COLAB"] = str(ENV_IS_GOOGLE_COLAB)

if not ENV_IS_GOOGLE_COLAB:
    # Local run: BASE is the (absolute) current working directory.
    BASE = Path().resolve()
    # Adapt the two paths below to your situation.
    DATASETS = Path('../dataset_ICDAR').resolve()  # Where your data are located before Dataset object creation
    OUT_BASE = Path('../res_ICDAR/method_0').resolve()  # Where the results of this notebook are saved
else:
    from google.colab import drive

    mountpoint = Path("/content/drive")
    drive.mount(str(mountpoint))  # Mount Google Drive at the mountpoint
    base = mountpoint / "MyDrive/article_icdar_2023"  # Adapt this to your situation
    sys.path.append(str(base))  # Add the project folder to the Python path
    BASE = base.resolve()  # Make BASE absolute
    DATASETS = BASE / "dataset_ICDAR"
    OUT_BASE = BASE / "res_ICDAR/method_0"

# Show the resulting configuration, one value per line.
print(sys.path, BASE, DATASETS, OUT_BASE, sep="\n")

In [None]:
# CONTROLS
# Exactly one of the two RUN_* flags must be True: a later cell asserts that
# they differ.
RUN_CAMEMBERT = True            # Set to False to skip training Camembert
RUN_CAMEMBERT_PRETRAINED = False  # Set to False to skip training Camembert pretrained

# Passed to train_bert(): the data are read with load_dataset() when True and
# with load_from_disk() when False.
USE_HUGGING_FACE_DATASET = True

# Number of times a model will be trained & evaluated on each trainset size
# listed in TRAINSETS_SIZES.
N_RUNS = 5

In [None]:
# Sanity check: exactly one of the two models must be selected for this run.
assert RUN_CAMEMBERT != RUN_CAMEMBERT_PRETRAINED

In [None]:
# Select the checkpoint, its short name, the results folder and the input
# dataset for the enabled model. The pretrained variant is tested first so that
# it keeps precedence if both flags were ever set at the same time.
if RUN_CAMEMBERT_PRETRAINED:
    MODEL, MODEL_NAME = "HueyNemud/das22-10-camembert_pretrained", 'pretrained_camembert_ner'
    FOLDER = f"12-flat-ner-ref-{MODEL_NAME}"
    INPUT_DIR = "nlpso/m0_fine_tuning_ref_ptrn_cmbert_io"
elif RUN_CAMEMBERT:
    MODEL, MODEL_NAME = "Jean-Baptiste/camembert-ner", 'camembert_ner'
    # NOTE(review): the "-testest" suffix does not match the output name
    # announced in the notebook header (`11-flat-ner-ref-camembert_ner`) —
    # looks like a leftover from a trial run; confirm before publishing results.
    FOLDER = f"11-flat-ner-ref-{MODEL_NAME}-testest"
    INPUT_DIR = "nlpso/m0_fine_tuning_ref_cmbert_io"

## Constants

In [None]:
import os
from pathlib import Path
from config import logger

# Training-set sizes iterated over by train_bert(); also used to name the
# metrics files.
TRAINSETS_SIZES = [6084]
# Root directory under which the metrics of this experiment are written.
METRICS_OUTPUT_DIR = OUT_BASE / "10-experiment_1_metrics"
# Bare expression: shown as the cell output when run in a notebook.
INPUT_DIR, METRICS_OUTPUT_DIR

## 10 - Train and eval on reference dataset

In [None]:
# COMMON CONSTANTS

# Hyper-parameters shared by every fine-tuning run. The training cells copy
# this dict, add an "output_dir" entry and the copy is handed to init_model().
# NOTE(review): the keys look like transformers.TrainingArguments fields —
# confirm how camembert_util.init_model consumes them.
TRAINING_CONFIG = {
    # Evaluation schedule and training length
    "evaluation_strategy": "steps",
    "eval_steps": 100,
    "max_steps": 5000,
    # Optimisation
    "learning_rate": 1e-4,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "weight_decay": 1e-5,
    # Best-model selection (higher "f1" is better)
    "load_best_model_at_end": True,
    "greater_is_better": True,
    "metric_for_best_model": "f1",
    # Checkpointing: same cadence as the evaluation, a single checkpoint kept
    "save_strategy": "steps",
    "save_steps": 100,
    "save_total_limit": 1,
}

In [None]:
from config import logger
from datasets import load_from_disk, load_dataset
import json
from camembert_util import init_model, train_eval_loop, _convert_tokenizer

def train_bert(metrics_output_directory, huggingface_dataset=True):
    """Train and evaluate MODEL ``N_RUNS`` times for every trainset size.

    For each run a ``run_<n>`` sub-directory is created under
    ``metrics_output_directory``; for each size in ``TRAINSETS_SIZES`` a fresh
    model is initialised, trained and evaluated, and the returned metrics are
    written to ``test_<size>.json`` and ``dev_<size>.json``.

    The function reads the module-level names ``N_RUNS``, ``TRAINSETS_SIZES``,
    ``INPUT_DIR``, ``MODEL`` and ``local_config``; they must all be defined
    before it is called.

    Args:
        metrics_output_directory: Directory (``str`` or ``Path``) receiving
            the per-run metrics folders.
        huggingface_dataset: When true, ``INPUT_DIR`` is passed as-is to
            ``load_dataset``; otherwise the dataset is read with
            ``load_from_disk`` from ``INPUT_DIR/huggingface_<size>``.
    """
    # Imported locally: no cell of this notebook imports torch, yet the CUDA
    # cache is released after every training.
    import torch

    metrics_root = Path(metrics_output_directory)

    # Train & evaluate loop
    for run in range(1, N_RUNS + 1):
        output_dir = metrics_root / f"run_{run}"
        output_dir.mkdir(exist_ok=True, parents=True)

        for trainset_size in TRAINSETS_SIZES:
            if huggingface_dataset:
                datasetdir = INPUT_DIR
                load = load_dataset
            else:
                # INPUT_DIR is a plain string: wrap it in Path so that "/"
                # builds a sub-path instead of raising TypeError.
                datasetdir = str(Path(INPUT_DIR) / f"huggingface_{trainset_size}")
                load = load_from_disk
            logger.info(f"Running on datasets in {datasetdir}")
            logger.info(f"Metrics will be saved in {output_dir}")

            # A fresh model, tokenizer and training arguments for every
            # (run, trainset size) pair; local_config is a module-level dict.
            model, tokenizer, training_args = init_model(MODEL, local_config, run)
            logger.info(f"{model} #{run}, will save in {output_dir}")

            train_dev_test = load(datasetdir)

            metrics = train_eval_loop(
                model,
                training_args,
                tokenizer,
                train_dev_test["train"],
                train_dev_test["dev"],
                train_dev_test["test"],
            )

            # Save the test and dev metrics.
            # NOTE(review): assumes train_eval_loop returns
            # (test_metrics, dev_metrics) in that order — confirm in
            # camembert_util.
            for split_name, split_metrics in (("test", metrics[0]), ("dev", metrics[1])):
                metrics_file = output_dir / f"{split_name}_{trainset_size}.json"
                with open(metrics_file, "w", encoding="utf-8") as o:
                    json.dump(split_metrics, o)

            torch.cuda.empty_cache()

## 11 - CamemBERT - train & eval on reference dataset

In [None]:
import time
import datetime

# Fine-tune and evaluate the CamemBERT NER model when it is the selected one.
if RUN_CAMEMBERT:
    # No other cell imports torch, although it is needed below (and inside
    # train_bert) to release the CUDA cache.
    import torch

    # _convert_tokenizer (imported from camembert_util) must have been created
    # for the checkpoint selected in MODEL.
    assert _convert_tokenizer.name_or_path == MODEL

    # MODEL CONSTS
    MODEL_METRICS_DIR = METRICS_OUTPUT_DIR / f"{FOLDER}"
    MODEL_METRICS_DIR.mkdir(exist_ok=True, parents=True)
    MODEL_OUTPUT_MODEL_PATH = OUT_BASE / f"tmp/{FOLDER}"
    print(MODEL_METRICS_DIR, MODEL_OUTPUT_MODEL_PATH)

    # Set config output dir. local_config is read as a global by train_bert().
    local_config = TRAINING_CONFIG.copy()
    local_config["output_dir"] = MODEL_OUTPUT_MODEL_PATH

    # Run the main loop and report the average wall-clock time of one run.
    h = time.time()
    train_bert(MODEL_METRICS_DIR, USE_HUGGING_FACE_DATASET)
    runtime = (time.time() - h) / N_RUNS
    print(f"Run-time is equal to {str(datetime.timedelta(seconds=runtime))}")
    torch.cuda.empty_cache()
else:
    print("Skipped finetuning model for IO labels")

## 12 - CamemBERT pretrained - train & eval

In [None]:
import time
import datetime

# Fine-tune and evaluate the pretrained CamemBERT model when it is the
# selected one.
if RUN_CAMEMBERT_PRETRAINED:
    # No other cell imports torch, although it is needed below (and inside
    # train_bert) to release the CUDA cache.
    import torch

    # _convert_tokenizer (imported from camembert_util) must have been created
    # for the checkpoint selected in MODEL.
    assert _convert_tokenizer.name_or_path == MODEL

    # MODEL CONSTS
    MODEL_METRICS_DIR = METRICS_OUTPUT_DIR / f"{FOLDER}"
    MODEL_METRICS_DIR.mkdir(exist_ok=True, parents=True)
    MODEL_OUTPUT_MODEL_PATH = OUT_BASE / f"tmp/{FOLDER}"
    print(MODEL_METRICS_DIR, MODEL_OUTPUT_MODEL_PATH)

    # Set config output dir. local_config is read as a global by train_bert().
    local_config = TRAINING_CONFIG.copy()
    local_config["output_dir"] = MODEL_OUTPUT_MODEL_PATH

    # Run the main loop and report the average wall-clock time of one run.
    h = time.time()
    train_bert(MODEL_METRICS_DIR, USE_HUGGING_FACE_DATASET)
    runtime = (time.time() - h) / N_RUNS
    print(f"Run-time is equal to {str(datetime.timedelta(seconds=runtime))}")
    torch.cuda.empty_cache()
else:
    print("Skipped finetuning pretrained model for IO labels")