# 20 - Experiment #1

**Requirements** 

Do not forget to install the SpaCy pipeline `fr_core_news_lg` before running Experiment 1:
```bash
python -m spacy download fr_core_news_lg
```
---

In [1]:
import json
import os
from pathlib import Path

# GLOBAL CONSTANTS
# Number of trainings per (model, trainset size) pair.
# Raise this value to repeat every training several times.
N_RUNS = 1

# Every trainset size (number of examples) for which a prepared dataset is expected.
ALL_TRAINSETS_SIZES = [49, 99, 199, 398, 769, 1593, 3186, 6373]

# Sizes actually used by the train & eval loops of this notebook.
# Only the 199-example trainset is enabled; assign ALL_TRAINSETS_SIZES instead
# to run on every size.
TRAINSETS_SIZES = [199]

# INPUT / OUTPUT DIRS
# A notebook has no __file__ variable, so every path is anchored on the kernel's
# working directory (with symlinks resolved).
nb_loc = Path.cwd().resolve()
INPUT_DIR = nb_loc / "00-prepared_datasets/01-experiment_1"
METRICS_OUTPUT_DIR = nb_loc / "20-experiment_1_metrics"


## 21. SpaCy NER pipeline - train & eval

In [2]:
# SPACY CONSTS
# Device selector handed to spaCy's train & evaluate calls.
# With -1 spaCy reports "Using CPU".
SPACY_USE_GPU = -1

# The trained spaCy model is saved here. Every run and every trainset size
# overwrites the previous one, so only the model of the last training remains.
SPACY_SAVE_MODEL_PATH = "/tmp"

# Directory receiving the spaCy NER metrics, created along with any missing parent.
SPACY_NER_METRICS_DIR = METRICS_OUTPUT_DIR / "21-spacy_ner"
SPACY_NER_METRICS_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
from spacy.cli import train, evaluate
from config import logger

# Train & evaluate loop
# train.train() saves model-best (and model-last) under SPACY_SAVE_MODEL_PATH.
# That constant is a plain string, so it is wrapped in Path before joining:
# "str / str" raises a TypeError.
model_best = Path(SPACY_SAVE_MODEL_PATH) / "model-best"

for run in range(1, N_RUNS + 1):
    output_dir = SPACY_NER_METRICS_DIR / f"run_{run}"
    output_dir.mkdir(exist_ok=True, parents=True)

    logger.info(f"SpaCy run #{run}, will save in {output_dir}")

    for trainset_size in TRAINSETS_SIZES:
        # Paths to the prepared datasets for this trainset size
        trainset = INPUT_DIR / f"spacy_train_{trainset_size}.spacy"
        devset = INPUT_DIR / f"spacy_dev_{trainset_size}.spacy"
        testset = INPUT_DIR / f"spacy_test_{trainset_size}.spacy"

        # Pass train & dev paths as spaCy config overrides
        spacy_opts = {
            "paths.train": str(trainset),
            "paths.dev": str(devset),
        }

        # Train now !
        train.train("cnn_config.cfg",       # The pipeline configuration file
                    SPACY_SAVE_MODEL_PATH,  # save model-best and model-last here
                    use_gpu=SPACY_USE_GPU,  # Use GPU if asked
                    overrides=spacy_opts)   # Pass training options

        # Compute metrics on the test set first, then on the dev set.
        # NOTE(review): both evaluations share the same displacy_path, so the
        # second one presumably overwrites the renders of the first - confirm.
        for split, dataset in (("test", testset), ("dev", devset)):
            metrics_file = output_dir / f"{split}_{trainset_size}.json"
            evaluate(str(model_best),                 # Where is the trained model
                     dataset,                         # Dataset to evaluate on
                     metrics_file,                    # Save metrics here
                     use_gpu=SPACY_USE_GPU,           # Use GPU if asked
                     displacy_path=output_dir,        # Save a few tagged results to be shown with displacy
                     displacy_limit=100)              # How much is "a few"


16/01/2022 02:40:29 ; INFO ; SpaCy run #1, will save in /home/bertrand/dev/paper-ner-bench-das22/src/ner/20-experiment_1_metrics/21-spacy_ner/run_1


[38;5;4mℹ Saving to output directory: /tmp[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     55.74   23.37   30.55   18.93    0.23
  8     200          0.00   2289.49   87.86   87.69   88.04    0.88
 18     400          0.00    177.48   88.13   87.86   88.39    0.88
 31     600          0.00     12.94   87.78   87.42   88.14    0.88
 47     800          0.00      4.55   87.81   87.40   88.21    0.88
 66    1000          0.00     13.56   88.15   87.81   88.50    0.88
 90    1200          0.00      2.45   87.86   87.55   88.18    0.88
118    1400          0.00      4.52   87.62   87.17   88.07    0.88
153    1600          0.00     25.64   87.28   87.17   87.39    0.87


## 22. CamemBERT - Common

In [None]:
# COMMON CONSTANTS

# Training options shared by both CamemBERT experiments: this dict is handed to
# init_model(), which returns the training_args used by train_bert().
TRAINING_CONFIG = dict(
    # Evaluation schedule
    evaluation_strategy="steps",
    eval_steps=100,
    # Optimisation
    max_steps=5000,
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=1e-5,
    # Best model selection: the highest "f1" wins
    load_best_model_at_end=True,
    greater_is_better=True,
    metric_for_best_model="f1",
    # Checkpointing
    save_strategy="steps",
    # NOTE(review): the author suspected this value of triggering a bug in the
    # early-stopping callback - unconfirmed.
    save_steps=100,
    save_total_limit=1,
)

In [None]:
from config import logger
from datasets import load_from_disk
from camembert_util import train_eval_loop

def train_bert(metrics_output_directory):
    """Train and evaluate the current model on every trainset size, N_RUNS times.

    The function reads three notebook-level globals that must be set before it
    is called (the notebook sets them with init_model()): ``model``,
    ``training_args`` and ``tokenizer``.

    For each run, a ``run_<n>`` directory is created under
    ``metrics_output_directory``. For each size in TRAINSETS_SIZES, the dataset
    ``huggingface_<size>`` is loaded from INPUT_DIR and unpacked as keyword
    arguments of train_eval_loop(). The first item of the returned metrics is
    saved as ``test_<size>.json`` and the second as ``dev_<size>.json``.

    Args:
        metrics_output_directory: pathlib.Path of the directory that receives
            the ``run_<n>`` sub-directories.
    """
    # Train & evaluate loop
    for run in range(1, N_RUNS + 1):
        output_dir = metrics_output_directory / f"run_{run}"
        output_dir.mkdir(exist_ok=True, parents=True)

        logger.info(f"{model} #{run}, will save in {output_dir}")

        for trainset_size in TRAINSETS_SIZES:
            # The notebook never defines WORK_DIR; the prepared datasets of this
            # experiment live in INPUT_DIR.
            # NOTE(review): assumes the huggingface_<size> datasets sit next to
            # the spaCy ones - confirm.
            datasetdir = INPUT_DIR / f"huggingface_{trainset_size}"

            logger.info(f"Running on datasets in {datasetdir}")
            logger.info(f"Metrics will be saved in {output_dir}")

            # Train now !
            train_dev_test = load_from_disk(datasetdir)
            metrics = train_eval_loop(model,         # Implicit. Must be set before calling train_bert()
                                      training_args, # Idem
                                      tokenizer,     # Idem
                                      **train_dev_test)

            # Save the metrics: metrics[0] goes to the test file, metrics[1] to the dev file
            for index, split in enumerate(("test", "dev")):
                metrics_file = output_dir / f"{split}_{trainset_size}.json"
                with open(metrics_file, "w", encoding="utf-8") as o:
                    json.dump(metrics[index], o)


                

## 23 - CamemBERT - train & eval

In [None]:
# CAMEMBERT CONSTS
# Model identifier handed to init_model() for this experiment.
CAMEMBERT_MODEL = "Jean-Baptiste/camembert-ner"

# Directory receiving the metrics of this model, created along with any missing parent.
CAMEMBERT_METRICS_DIR = METRICS_OUTPUT_DIR / "22-camembert"
CAMEMBERT_METRICS_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
from camembert_util import init_model

# Set the three globals that train_bert() reads implicitly:
# model, tokenizer and training_args.
model, tokenizer, training_args = init_model(
    CAMEMBERT_MODEL,
    TRAINING_CONFIG,
)

# Train & evaluate; the metrics are written under CAMEMBERT_METRICS_DIR.
train_bert(metrics_output_directory=CAMEMBERT_METRICS_DIR)

## 24 - CamemBERT pretrained - train & eval

In [None]:
# CAMEMBERT PRETRAINED CONSTS
# Model identifier handed to init_model() for this experiment.
CAMEMBERT_PRETRAINED_MODEL = "HueyNemud/berties-pretrained-das22"

# Directory receiving the metrics of this model, created along with any missing parent.
CAMEMBERT_PRETRAINED_METRICS_DIR = METRICS_OUTPUT_DIR / "23-camembert_pretrained"
CAMEMBERT_PRETRAINED_METRICS_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# init_model comes from camembert_util, the module every other cell of this
# notebook imports the CamemBERT helpers from.
from camembert_util import init_model

# Get the model components. These three globals are read implicitly by train_bert().
model, tokenizer, training_args = init_model(CAMEMBERT_PRETRAINED_MODEL, TRAINING_CONFIG)

# Run the main loop; the metrics are written under CAMEMBERT_PRETRAINED_METRICS_DIR.
train_bert(CAMEMBERT_PRETRAINED_METRICS_DIR)