# 20 - Experiment #1

In [None]:
# CONTROLS
# Each flag enables (True) or skips (False) the training of one model family.
RUN_SPACY = False                 # SpaCy NER
RUN_CAMEMBERT = False             # Camembert
RUN_CAMEMBERT_PRETRAINED = True   # Camembert pretrained

# How many times each model is trained & evaluated on every trainset.
N_RUNS = 1

## Initialisation
Set the BASE path.
If run on Google Colab, this will also mount Google Drive at the mountpoint given below.

In [4]:
import os, sys
from pathlib import Path

# The repr of the running IPython shell mentions 'google.colab' when executed on Colab.
ENV_IS_GOOGLE_COLAB = 'google.colab' in str(get_ipython())
os.environ["ENV_IS_GOOGLE_COLAB"] = str(ENV_IS_GOOGLE_COLAB)

if not ENV_IS_GOOGLE_COLAB:
  # "__file__" is a plain string here (notebooks define no __file__): it is resolved
  # against the current working directory, so BASE ends up being that directory,
  # which is normally the directory of this notebook.
  base = os.path.dirname(os.path.realpath("__file__"))
else:
  from google.colab import drive
  mountpoint = Path("/content/drive")
  drive.mount(str(mountpoint))  # Mount Google Drive
  base = mountpoint / "MyDrive/SODUCO/article_das_2022"  # Adapt this to your situation
  sys.path.append(str(base))  # Make the modules stored in BASE importable

BASE = Path(base).resolve()  # Always work with an absolute path
print(sys.path)
BASE

['/home/bertrand/dev/paper-ner-bench-das22/src/ner', '/home/bertrand/anaconda3/lib/python39.zip', '/home/bertrand/anaconda3/lib/python3.9', '/home/bertrand/anaconda3/lib/python3.9/lib-dynload', '', '/home/bertrand/anaconda3/lib/python3.9/site-packages', '/home/bertrand/anaconda3/lib/python3.9/site-packages/locket-0.2.1-py3.9.egg', '/home/bertrand/anaconda3/lib/python3.9/site-packages/IPython/extensions', '/home/bertrand/.ipython']


PosixPath('/home/bertrand/dev/paper-ner-bench-das22/src/ner')

In [2]:
!pip install --upgrade transformers datasets spacy transformers[sentencepiece] seqeval



## Constants

In [5]:
import os
from pathlib import Path
from config import logger

# Number of examples in each prepared trainset; one dataset is expected per size.
TRAINSETS_SIZES = [49, 99, 199, 398, 796, 1593, 3186, 6373]

# INPUT / OUTPUT DIRS, both located under BASE
INPUT_DIR = BASE.joinpath("01-experiment_1_prepared_datasets")
METRICS_OUTPUT_DIR = BASE.joinpath("20-experiment_1_metrics")
(INPUT_DIR, METRICS_OUTPUT_DIR)

17/01/2022 02:41:24 ; INFO ; Train SpaCy NER ? False
17/01/2022 02:41:24 ; INFO ; Train Camembert ? False
17/01/2022 02:41:24 ; INFO ; Train Camembert pretrained ? True


(PosixPath('/home/bertrand/dev/paper-ner-bench-das22/src/ner/01-experiment_1_prepared_datasets'),
 PosixPath('/home/bertrand/dev/paper-ner-bench-das22/src/ner/20-experiment_1_metrics'))

## 21. SpaCy NER pipeline - train & eval

In [6]:
# SPACY CONSTS
# Metrics of the SpaCy runs get their own sub-directory, created on demand.
SPACY_NER_METRICS_DIR = METRICS_OUTPUT_DIR.joinpath("21-spacy_ner")
SPACY_NER_METRICS_DIR.mkdir(parents=True, exist_ok=True)

# SpaCy's model will be overwritten by each run and for each trainset size,
# so the last model left on disk is the one trained on the largest trainset.
SPACY_OUTPUT_MODEL_PATH = "/tmp"
SPACY_USE_GPU = -1  # Forwarded as `use_gpu` to SpaCy's train/evaluate; -1 selects the CPU

In [7]:
# Download the fr_core_news_lg SpaCy pipeline, but only when the SpaCy experiment is enabled (RUN_SPACY)
if RUN_SPACY:
  !python -m spacy download fr_core_news_lg

In [8]:
# The logger is needed by both branches, so import it unconditionally.
from config import logger

if RUN_SPACY:
  from pathlib import Path
  from spacy.cli import train, evaluate

  # SPACY_OUTPUT_MODEL_PATH may be a plain string: wrap it in a Path, otherwise
  # the "/" operator raises a TypeError (str / str is not defined).
  model_best = Path(SPACY_OUTPUT_MODEL_PATH) / "model-best"

  # Train & evaluate loop
  for run in range(1, N_RUNS + 1):
      output_dir = SPACY_NER_METRICS_DIR / f"run_{run}"
      output_dir.mkdir(exist_ok=True, parents=True)

      logger.info(f"SpaCy run #{run}, will save in {output_dir}")

      for trainset_size in TRAINSETS_SIZES:
          # Paths to the datasets prepared for this trainset size
          trainset = INPUT_DIR / f"spacy_train_{trainset_size}.spacy"
          devset = INPUT_DIR / f"spacy_dev_{trainset_size}.spacy"
          testset = INPUT_DIR / f"spacy_test_{trainset_size}.spacy"

          # Pass train & dev paths as SpaCy config items
          spacy_opts = {
              "paths.train": str(trainset),
              "paths.dev": str(devset),
          }

          # Train now !
          train.train("cnn_config.cfg",        # The pipeline configuration file
                      SPACY_OUTPUT_MODEL_PATH, # save model-best and model-last here
                      use_gpu=SPACY_USE_GPU,   # Use GPU if asked
                      overrides=spacy_opts)    # Pass training options

          # Compute metrics on the test set first, then on the dev set.
          # NOTE(review): both evaluations use the same displacy directory, so the
          # dev pass presumably overwrites the renderings of the test pass - confirm
          # this is intended.
          for split, dataset in (("test", testset), ("dev", devset)):
              metrics_file = output_dir / f"{split}_{trainset_size}.json"
              evaluate(model_best,               # Where is the trained model
                       dataset,                  # Dataset to evaluate on
                       metrics_file,             # Save metrics here
                       use_gpu=SPACY_USE_GPU,    # Use GPU if asked
                       displacy_path=output_dir, # Save a few tagged results to be shown with displacy
                       displacy_limit=100)       # How much is "a few"
else:
  logger.info("Skipped finetuning SpaCy NER")

17/01/2022 02:41:33 ; INFO ; Skipped finetuning SpaCy NER


## 22. CamemBERT - Common

In [9]:
# COMMON CONSTANTS

# Training options shared by every CamemBERT fine-tuning. Each model cell copies
# this dict and adds its own "output_dir" before handing it to init_model().
TRAINING_CONFIG = dict(
    # Evaluation schedule
    evaluation_strategy="steps",
    eval_steps=100,
    max_steps=5000,
    # Optimisation
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=1e-5,
    # Best-model selection
    load_best_model_at_end=True,
    greater_is_better=True,
    metric_for_best_model="f1",
    # Checkpointing
    save_strategy="steps",
    save_steps=100,  # NOTE(review): the author suspected this value of triggering an early-stopping callback bug - unconfirmed
    save_total_limit=1,
)

In [10]:
from config import logger
from datasets import load_from_disk
from camembert_util import train_eval_loop
import json

def train_bert(metrics_output_directory):
    """Train and evaluate the current BERT model on every trainset, N_RUNS times.

    The model components are NOT parameters: the module-level names `model`,
    `training_args` and `tokenizer` must be set before calling this function.

    For each run and each size in TRAINSETS_SIZES, the dataset stored in
    INPUT_DIR / "huggingface_<size>" is loaded and unpacked as keyword arguments
    of train_eval_loop(). The first element of the returned metrics is saved as
    "test_<size>.json" and the second one as "dev_<size>.json" (presumably the
    (test, dev) order returned by camembert_util - confirm there).

    Args:
        metrics_output_directory: directory (str or Path) that receives one
            "run_<n>" sub-directory per run. It is created when missing.
    """
    from pathlib import Path

    metrics_root = Path(metrics_output_directory)

    # Log a short label rather than the model object itself, whose string form
    # can be a dump of the whole architecture.
    model_label = getattr(model, "name_or_path", None) or type(model).__name__

    # Train & evaluate loop
    for run in range(1, N_RUNS + 1):
        output_dir = metrics_root / f"run_{run}"
        output_dir.mkdir(exist_ok=True, parents=True)

        logger.info(f"{model_label} #{run}, will save in {output_dir}")

        for trainset_size in TRAINSETS_SIZES:
            datasetdir = INPUT_DIR / f"huggingface_{trainset_size}"

            logger.info(f"Running on datasets in {datasetdir}")
            logger.info(f"Metrics will be saved in {output_dir}")

            # Train now !
            # NOTE(review): the same `model` object is passed for every trainset
            # size and every run - confirm that train_eval_loop() starts from
            # fresh weights, otherwise trainings leak into each other.
            train_dev_test = load_from_disk(datasetdir)
            metrics = train_eval_loop(model,         # Implicit. Must be set before calling train_bert()
                                      training_args, # Idem
                                      tokenizer,     # Idem
                                      **train_dev_test)

            # Save the metrics, one JSON file per evaluated split
            for split, split_metrics in (("test", metrics[0]), ("dev", metrics[1])):
                metrics_file = output_dir / f"{split}_{trainset_size}.json"
                with open(metrics_file, "w", encoding="utf-8") as o:
                    json.dump(split_metrics, o)


                

[nltk_data] Downloading package punkt to /home/bertrand/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## 23 - CamemBERT - train & eval

In [11]:
# CAMEMBERT CONSTS
CAMEMBERT_MODEL = "Jean-Baptiste/camembert-ner"    # Model identifier handed to init_model()
CAMEMBERT_OUTPUT_MODEL_PATH = "/tmp/22-camembert"  # Becomes the "output_dir" of the training config

# Metrics of the CamemBERT runs get their own sub-directory, created on demand.
CAMEMBERT_METRICS_DIR = METRICS_OUTPUT_DIR.joinpath("22-camembert")
CAMEMBERT_METRICS_DIR.mkdir(parents=True, exist_ok=True)
(CAMEMBERT_METRICS_DIR, CAMEMBERT_OUTPUT_MODEL_PATH)

(PosixPath('/home/bertrand/dev/paper-ner-bench-das22/src/ner/20-experiment_1_metrics/22-camembert'),
 '/tmp/22-camembert')

In [12]:
if not RUN_CAMEMBERT:
  logger.info("Skipped finetuning Camembert")
else:
  from camembert_util import init_model

  # Shared training options plus the output directory specific to this model
  local_config = {**TRAINING_CONFIG, "output_dir": CAMEMBERT_OUTPUT_MODEL_PATH}

  # train_bert() reads `model`, `tokenizer` and `training_args` as globals:
  # the three names below must be kept exactly as they are.
  model, tokenizer, training_args = init_model(CAMEMBERT_MODEL, local_config)

  # Run the main loop
  train_bert(CAMEMBERT_METRICS_DIR)

17/01/2022 02:41:50 ; INFO ; Skipped finetuning Camembert


## 23 - CamemBERT pretrained - train & eval

In [13]:
# CAMEMBERT PRETRAINED CONSTS
CAMEMBERT_PRETRAINED_MODEL = "HueyNemud/das22-10-camembert_pretrained"    # Model identifier handed to init_model()
CAMEMBERT_PRETRAINED_OUTPUT_MODEL_PATH = "/tmp/22-camembert_pretrained"  # Becomes the "output_dir" of the training config

# Metrics of the pretrained CamemBERT runs get their own sub-directory, created on demand.
CAMEMBERT_PRETRAINED_METRICS_DIR = METRICS_OUTPUT_DIR.joinpath("23-camembert_pretrained")
CAMEMBERT_PRETRAINED_METRICS_DIR.mkdir(parents=True, exist_ok=True)

In [14]:
if not RUN_CAMEMBERT_PRETRAINED:
  logger.info("Skipped finetuning Camembert-pretraining")
else:
  from camembert_util import init_model

  # Shared training options plus the output directory specific to this model
  local_config = {**TRAINING_CONFIG, "output_dir": CAMEMBERT_PRETRAINED_OUTPUT_MODEL_PATH}

  # train_bert() reads `model`, `tokenizer` and `training_args` as globals:
  # the three names below must be kept exactly as they are.
  model, tokenizer, training_args = init_model(CAMEMBERT_PRETRAINED_MODEL, local_config)

  # Run the main loop
  train_bert(CAMEMBERT_PRETRAINED_METRICS_DIR)

17/01/2022 02:41:56 ; INFO ; Model HueyNemud/das22-10-camembert_pretrained


Downloading:   0%|          | 0.00/671 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.33M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/210 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/958 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

Some weights of the model checkpoint at HueyNemud/das22-10-camembert_pretrained were not used when initializing CamembertForTokenClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing CamembertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at HueyNemud/das22-10-camembert_pretrained and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this mo

Step,Training Loss,Validation Loss


KeyboardInterrupt: 