# 20 - Experiment #1

This notebook contains the code to run experiment #1, described in section 5 of the paper.

In [3]:
""" MAIN CONTROLS """

# CONTROLS
# One switch per model: set a switch to False to skip training that model.
RUN_SPACY = True                 # SpaCy NER
RUN_CAMEMBERT = True             # Camembert
RUN_CAMEMBERT_PRETRAINED = True  # Camembert pretrained

# How many times each model is trained & evaluated on each of the 8 trainsets.
N_RUNS = 1

## Initialisation

The initialisation step:
- sets up the environment on Google Colab (optional)
- sets the random seed SPLIT_SEED used in all training set generation to ensure repeatable results
- creates a logger named nerlogger
- defines the paths to the directories used by the NER notebooks
- imports all the modules used in this notebook


In [None]:
""" RUN THIS BLOCK ONLY ON GOOGLE COLAB """

# `GDRIVE_PAPER_FOLDER` is the path, relative to the root of your Google Drive
# (`/content/drive/MyDrive`), of the folder containing the code of the paper.
# ADAPT TO YOUR SITUATION !
%env GDRIVE_PAPER_FOLDER=TEST

# Mount Google Drive to your Colab environment. May require to log in to Google.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Copy the Python modules in `$GDRIVE_PAPER_FOLDER/src/ner/util` to the
# current working directory so that they can be imported.
!cp -r /content/drive/MyDrive/$GDRIVE_PAPER_FOLDER/src/ner/util .

# Copy the spacy configuration
!cp -r /content/drive/MyDrive/$GDRIVE_PAPER_FOLDER/src/ner/cnn_config.cfg .

# Install dependencies
!pip install -q datasets transformers[sentencepiece]
# Upgrade SpaCy to the latest available release (v3 is expected).
# NOTE(review): the previous comment also mentioned NLTK, but only spacy is upgraded here.
!pip install -qU spacy

In [5]:
""" Loads the configuration """

# Set to 1/true to set the logging level of nerlogger to DEBUG
# and to save the spacy datasets as TXT along with the .spacy file,
# for easier debugging of the training set generation.
%env DEBUG=1

# If True, activates a set of assertions in the notebooks to ensure
# that the scripts run with the parameters used in the paper.
%env AS_IN_THE_PAPER = True

# NOTE(review): the environment variables above are set before this import,
# presumably because util.config reads them at import time — confirm.
import util.config as config

# Log the resolved configuration (directories, debug mode, seed, ...).
config.show()


23/05/2022 04:07:50 ; INFO ; BASEDIR: /content/drive/MyDrive/TEST
23/05/2022 04:07:50 ; INFO ; Input datasets will be loaded from DATASETDIR /content/drive/MyDrive/TEST/dataset
23/05/2022 04:07:50 ; INFO ; Training data and models will be saved to NERDIR /content/drive/MyDrive/TEST/src/ner
23/05/2022 04:07:50 ; INFO ; Debug mode is ON
23/05/2022 04:07:50 ; INFO ; Random seed: 42
23/05/2022 04:07:50 ; INFO ; Enable reproducibility checks: True


env: DEBUG=1
env: AS_IN_THE_PAPER=True


In [6]:
""" Import all modules at once """

# General imports
import os
import pathlib
import nltk
import tempfile
import json

# NER imports
from util.as_in_the_paper import assert_expected

# Number of examples in each expected trainset; every prepared dataset
# is identified by one of these sizes.
TRAINSETS_SIZES = [
    49, 99, 199, 398,
    796, 1593, 3186, 6373,
]

# INPUT / OUTPUT DIRS
# Prepared datasets are read from INPUT_DIR, metrics go to METRICS_OUTPUT_DIR.
INPUT_DIR = config.NERDIR.joinpath("01-experiment_1_prepared_datasets")
METRICS_OUTPUT_DIR = config.NERDIR.joinpath("20-experiment_1_metrics")
(INPUT_DIR, METRICS_OUTPUT_DIR)

(PosixPath('/content/drive/MyDrive/TEST/src/ner/01-experiment_1_prepared_datasets'),
 PosixPath('/content/drive/MyDrive/TEST/src/ner/20-experiment_1_metrics'))

## 21. SpaCy NER pipeline - train & eval

In [7]:
# SPACY CONSTS
# Directory receiving the SpaCy metrics; created up-front with its parents.
SPACY_NER_METRICS_DIR = METRICS_OUTPUT_DIR.joinpath("21-spacy_ner")
SPACY_NER_METRICS_DIR.mkdir(parents=True, exist_ok=True)

# SpaCy's model is written to the system temporary directory and is
# overwritten by each run and for each trainset size, so the model left
# on disk at the end is the one trained on the largest trainset.
SPACY_OUTPUT_MODEL_PATH = pathlib.Path(tempfile.gettempdir())
SPACY_USE_GPU = -1  # -1 selects the CPU

In [10]:
""" SpaCy preparation """
if RUN_SPACY:
  # Download the SpaCy pipeline package `fr_core_news_lg` (only when SpaCy is enabled).
  # NOTE(review): presumably referenced by cnn_config.cfg — confirm.
  !python -m spacy download fr_core_news_lg
  


Collecting fr-core-news-lg==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_lg-3.3.0/fr_core_news_lg-3.3.0-py3-none-any.whl (571.8 MB)
[K     |████████████████████████████████| 571.8 MB 12 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_lg')


In [None]:
if RUN_SPACY:
  # `train` is the spacy.cli.train module (its train() function is called below).
  from spacy.cli import train, evaluate

  # Train & evaluate loop: N_RUNS runs, each covering every trainset size.
  for run in range(1, N_RUNS + 1):
      output_dir = SPACY_NER_METRICS_DIR / f"run_{run}"
      output_dir.mkdir(exist_ok=True)

      config.logger.info(f"SpaCy run #{run}, will save in {output_dir}")

      for trainset_size in TRAINSETS_SIZES:
          # paths to datasets
          trainset = INPUT_DIR / f"spacy_train_{trainset_size}.spacy"
          devset = INPUT_DIR / f"spacy_dev_{trainset_size}.spacy"
          testset = INPUT_DIR / f"spacy_test_{trainset_size}.spacy"

          # Pass train & dev paths as SpaCy config items
          spacy_opts = {
              "paths.train": str(trainset),
              "paths.dev": str(devset),
          }

          # Train now !
          train.train("cnn_config.cfg",       # The pipeline configuration file
                      SPACY_OUTPUT_MODEL_PATH,# save model-best and model-last here
                      use_gpu=SPACY_USE_GPU,  # Use GPU if asked
                      overrides=spacy_opts)   # Pass training options

          model_best = SPACY_OUTPUT_MODEL_PATH / "model-best"

          # Compute metrics on the test set, then on the dev set.
          for split, dataset in (("test", testset), ("dev", devset)):
              metrics_file = output_dir / f"{split}_{trainset_size}.json"

              # spaCy writes its displacy renders under fixed file names inside
              # `displacy_path`. Sharing `output_dir` between all evaluations made
              # every render overwrite the previous one, so each (split, size)
              # pair gets its own directory. It must exist before evaluate() runs.
              displacy_dir = output_dir / f"displacy_{split}_{trainset_size}"
              displacy_dir.mkdir(exist_ok=True)

              evaluate(model_best,                # Where is the trained model
                       dataset,                   # Dataset to evaluate on
                       metrics_file,              # Save metrics here
                       use_gpu=SPACY_USE_GPU,     # Use GPU if asked
                       displacy_path=displacy_dir,# Save a few tagged results to be shown with displacy
                       displacy_limit=100)        # How much is "a few"
else:
  config.logger.info("Skipped finetuning SpaCy NER")

23/05/2022 04:11:52 ; INFO ; SpaCy run #1, will save in /content/drive/MyDrive/TEST/src/ner/20-experiment_1_metrics/21-spacy_ner/run_1


[38;5;4mℹ Saving to output directory: /tmp[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     58.05   25.66   33.72   20.71    0.26
 36     200          0.00   1323.91   82.20   80.51   83.96    0.82
 80     400          0.00      0.00   82.63   81.37   83.93    0.83
131     600          0.00      0.01   82.49   81.40   83.61    0.82
197     800          0.00      0.00   81.98   80.79   83.21    0.82
273    1000          0.00      0.00   81.90   80.25   83.61    0.82
373    1200          0.00      0.00   82.18   80.60   83.82    0.82
473    1400          0.00      0.00   81.91   80.28   83.61    0.82
601    1600          0.00      0.00   82.59   81.10   84.14    0.83
801    1800          0.00      0.00   82.08   80

## 22. CamemBERT - Common

In [None]:
# SHARED CONSTANTS

# Training options shared by both CamemBERT models; each training cell copies
# this mapping and adds its own "output_dir" before handing it to init_model().
TRAINING_CONFIG = dict(
    # Evaluate every `eval_steps` steps, stop after `max_steps` steps.
    evaluation_strategy="steps",
    eval_steps=100,
    max_steps=5000,
    # Optimisation
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=1e-5,
    # Keep the checkpoint with the highest "f1"
    load_best_model_at_end=True,
    greater_is_better=True,
    metric_for_best_model="f1",
    # Checkpointing, kept on the same schedule as evaluation.
    # NOTE(review): the original author wondered whether save_steps triggers a
    # bug with the early-stopping callback ("Make Early callback bug ?") — unconfirmed.
    save_strategy="steps",
    save_steps=100,
    save_total_limit=1,
)

In [None]:

def train_bert(metrics_output_directory):
    """Train and evaluate the current BERT model on every trainset, N_RUNS times.

    For each run and each size in TRAINSETS_SIZES, the Hugging Face dataset
    stored in ``INPUT_DIR / f"huggingface_{size}"`` is loaded and passed to
    ``train_eval_loop``. The first two items of the returned metrics are saved
    as ``test_{size}.json`` and ``dev_{size}.json`` in
    ``metrics_output_directory / f"run_{run}"``.

    The function relies on three module-level names that MUST be set before
    calling it (see the cells calling ``init_model``): ``model``,
    ``training_args`` and ``tokenizer``.

    NOTE(review): ``load_from_disk`` and ``train_eval_loop`` are not imported in
    the visible cells of this notebook; presumably they come from ``datasets``
    and ``camembert_util`` respectively — confirm and import them explicitly.

    NOTE(review): the same ``model`` object is passed for every trainset size;
    whether ``train_eval_loop`` re-initialises it between trainings cannot be
    told from here — confirm.

    Args:
        metrics_output_directory: ``pathlib.Path`` of the directory under which
            the per-run metrics directories are created.
    """
    # The notebook never binds a bare `logger` name: the logger is exposed
    # as `config.logger` (as used by the SpaCy cells).
    log = config.logger

    # Train & evaluate loop
    for run in range(1, N_RUNS + 1):
        output_dir = metrics_output_directory / f"run_{run}"
        # parents=True so that the function also works when the caller has not
        # created `metrics_output_directory` beforehand.
        output_dir.mkdir(parents=True, exist_ok=True)

        log.info(f"{model} #{run}, will save in {output_dir}")

        for trainset_size in TRAINSETS_SIZES:
            datasetdir = INPUT_DIR / f"huggingface_{trainset_size}"

            log.info(f"Running on datasets in {datasetdir}")
            log.info(f"Metrics will be saved in {output_dir}")

            # Train now !
            train_dev_test = load_from_disk(datasetdir)
            metrics = train_eval_loop(model,         # Implicit. Must be set before calling train_bert()
                                      training_args, # Idem
                                      tokenizer,     # Idem
                                      **train_dev_test)

            # Save the metrics: metrics[0] goes to the test file, metrics[1] to the dev file.
            for split, split_metrics in (("test", metrics[0]), ("dev", metrics[1])):
                metrics_file = output_dir / f"{split}_{trainset_size}.json"
                with open(metrics_file, "w", encoding="utf-8") as o:
                    json.dump(split_metrics, o)


                

[nltk_data] Downloading package punkt to /home/bertrand/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## 23 - CamemBERT - train & eval

In [None]:
# CAMEMBERT CONSTS
# Model identifier passed to init_model() and directory used as training "output_dir".
CAMEMBERT_MODEL = "Jean-Baptiste/camembert-ner"
CAMEMBERT_OUTPUT_MODEL_PATH = "/tmp/22-camembert"

# Directory receiving the metrics of this model; created with its parents.
CAMEMBERT_METRICS_DIR = METRICS_OUTPUT_DIR.joinpath("22-camembert")
CAMEMBERT_METRICS_DIR.mkdir(parents=True, exist_ok=True)

(CAMEMBERT_METRICS_DIR, CAMEMBERT_OUTPUT_MODEL_PATH)

(PosixPath('/home/bertrand/dev/paper-ner-bench-das22/src/ner/20-experiment_1_metrics/22-camembert'),
 '/tmp/22-camembert')

In [None]:
if RUN_CAMEMBERT:
  from camembert_util import init_model

  # Work on a copy of the shared training options so that TRAINING_CONFIG
  # itself is left untouched, and set this model's output dir.
  local_config = TRAINING_CONFIG.copy()
  local_config["output_dir"] = CAMEMBERT_OUTPUT_MODEL_PATH

  # Get the model components. These three module-level names are read
  # implicitly by train_bert().
  model, tokenizer, training_args = init_model(CAMEMBERT_MODEL, local_config)

  # Run the main loop
  train_bert(CAMEMBERT_METRICS_DIR)
else:
  # The notebook exposes its logger as `config.logger`; a bare `logger`
  # name is never bound and raised a NameError here.
  config.logger.info("Skipped finetuning Camembert")

17/01/2022 02:41:50 ; INFO ; Skipped finetuning Camembert


## 24 - CamemBERT pretrained - train & eval

In [None]:
# CAMEMBERT PRETRAINED CONSTS
# Model identifier passed to init_model() and directory used as training "output_dir".
CAMEMBERT_PRETRAINED_MODEL = "HueyNemud/das22-10-camembert_pretrained"
CAMEMBERT_PRETRAINED_OUTPUT_MODEL_PATH = "/tmp/22-camembert_pretrained"

# Directory receiving the metrics of this model; created with its parents.
CAMEMBERT_PRETRAINED_METRICS_DIR = METRICS_OUTPUT_DIR.joinpath("23-camembert_pretrained")
CAMEMBERT_PRETRAINED_METRICS_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
if RUN_CAMEMBERT_PRETRAINED:
  from camembert_util import init_model

  # Work on a copy of the shared training options so that TRAINING_CONFIG
  # itself is left untouched, and set this model's output dir.
  local_config = TRAINING_CONFIG.copy()
  local_config["output_dir"] = CAMEMBERT_PRETRAINED_OUTPUT_MODEL_PATH

  # Get the model components. These three module-level names are read
  # implicitly by train_bert().
  model, tokenizer, training_args = init_model(CAMEMBERT_PRETRAINED_MODEL, local_config)

  # Run the main loop
  train_bert(CAMEMBERT_PRETRAINED_METRICS_DIR)
else:
  # The notebook exposes its logger as `config.logger`; a bare `logger`
  # name is never bound and raised a NameError here.
  config.logger.info("Skipped finetuning Camembert-pretraining")

17/01/2022 02:41:56 ; INFO ; Model HueyNemud/das22-10-camembert_pretrained


Downloading:   0%|          | 0.00/671 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.33M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/210 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/958 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

Some weights of the model checkpoint at HueyNemud/das22-10-camembert_pretrained were not used when initializing CamembertForTokenClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing CamembertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at HueyNemud/das22-10-camembert_pretrained and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this mo

Step,Training Loss,Validation Loss


KeyboardInterrupt: 