# 20 - Experiment #1

**Requirements** 

Do not forget to install the SpaCy pipeline `fr_core_news_lg` before running Experiment 1:
```bash
python -m spacy download fr_core_news_lg
```
---

In [1]:
import os
from pathlib import Path

# --- Experiment-wide settings ------------------------------------------------

# How many times the training loop is repeated for each trainset size
N_RUNS = 1

# Trainset sizes (number of examples) whose datasets are expected on disk.
# Full list: INPUT_DATASETS_SIZES = [49,99,199,398,769,1593,3186,6373]
INPUT_DATASETS_SIZES = [398]

# "__file__" is a quoted string here, not the module attribute: realpath()
# resolves it against the current working directory, and .parent yields
# that directory.
WORK_DIR = Path(os.path.realpath("__file__")).parent / "20-experiment_1"


# --- spaCy -------------------------------------------------------------------

# Forwarded as `use_gpu` to the spaCy train/evaluate calls
# (the recorded run with -1 logged "Using CPU").
SPACY_USE_GPU = -1

# Per-run spaCy metrics are stored below this directory
SPACY_NER_METRICS_DIR = WORK_DIR / "21-spacy_ner_metrics"
SPACY_NER_METRICS_DIR.mkdir(parents=True, exist_ok=True)

## 21. SpaCy NER pipeline - train & eval

Train the SpaCy pipeline on the NER task for each trainset created in step 00.

Change `N_RUNS` to do multiple runs.

In [2]:
from spacy.cli import train, evaluate
from config import logger

# Train & evaluate loop: for every run and every trainset size, train the
# spaCy pipeline, then score the best model on the test set and the dev set.
for run in range(1, N_RUNS + 1):
    RUN_OUTPUT_DIR = SPACY_NER_METRICS_DIR / f"run_{run}"
    RUN_OUTPUT_DIR.mkdir(exist_ok=True)

    logger.info(f"SpaCy run #{run}, will save in {RUN_OUTPUT_DIR}")

    for trainset_size in INPUT_DATASETS_SIZES:
        # Paths to the datasets for this trainset size
        trainset = WORK_DIR / f"spacy_train_{trainset_size}.spacy"
        devset = WORK_DIR / f"spacy_dev_{trainset_size}.spacy"
        testset = WORK_DIR / f"spacy_test_{trainset_size}.spacy"

        # Pass train & dev paths as spaCy config overrides
        spacy_opts = {
            "paths.train": str(trainset),
            "paths.dev": str(devset),
        }

        # Train now!
        train.train("cnn_config.cfg",       # The pipeline configuration file
                    RUN_OUTPUT_DIR,         # Trained pipelines are saved here
                    use_gpu=SPACY_USE_GPU,  # Use GPU if asked
                    overrides=spacy_opts)   # Pass training options

        # spaCy names its output directories "model-best" / "model-last"
        # (hyphen, not underscore). Every trainset size trains into the same
        # RUN_OUTPUT_DIR, so the model is evaluated right away, before the
        # next iteration replaces it.
        model_best = RUN_OUTPUT_DIR / "model-best"

        # Compute metrics on the test set first, then on the dev set.
        # NOTE(review): both evaluations share the same displacy_path, so the
        # dev renderings presumably replace the test ones - confirm.
        for split_name, dataset in (("test", testset), ("dev", devset)):
            metrics_file = RUN_OUTPUT_DIR / f"{split_name}_{trainset_size}.json"
            evaluate(model_best,                      # Where is the trained model
                     dataset,                         # Dataset to score
                     metrics_file,                    # Save metrics here
                     use_gpu=SPACY_USE_GPU,           # Use GPU if asked
                     displacy_path=RUN_OUTPUT_DIR,    # Save a few tagged results to be shown with displacy
                     displacy_limit=100)              # How much is "a few"


16/01/2022 01:31:51 ; INFO ; SpaCy run #1, will save in /home/bertrand/dev/paper-ner-bench-das22/src/ner/20-experiment_1/21-spacy_ner_metrics/run_1


[38;5;4mℹ Saving to output directory:
/home/bertrand/dev/paper-ner-bench-das22/src/ner/20-experiment_1/21-spacy_ner_metrics/run_1[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
[38;5;3m⚠ Aborting and saving the final best model. Encountered exception:
FileNotFoundError(2, 'No such file or directory')[0m


FileNotFoundError: [Errno 2] No such file or directory: '/home/bertrand/dev/paper-ner-bench-das22/src/ner/20-experiment_1/spacy_train_398.spacy'

## 22. CamemBERT - Common

In [None]:
# Training options shared by the CamemBERT experiments; this dict is handed
# to init_model() together with the model name.
TRAINING_CONFIG = dict(
    # Evaluation schedule
    evaluation_strategy="steps",
    eval_steps=100,
    # Optimisation budget and hyper-parameters
    max_steps=5000,
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=1e-5,
    # Best-model selection: higher "f1" is better
    load_best_model_at_end=True,
    greater_is_better=True,
    metric_for_best_model="f1",
    # Checkpointing
    save_strategy="steps",
    save_steps=100,  # Open question from the author: "Make Early callback bug ?"
    save_total_limit=1,
)

In [None]:
from config import logger
from datasets import load_from_disk
from camembert_util import train_eval_loop

def train_bert(metrics_dir=None):
    """Train and evaluate the current model on every trainset size.

    Relies on notebook globals: ``model``, ``tokenizer`` and
    ``training_args`` (unpacked from ``init_model``), plus ``N_RUNS``,
    ``INPUT_DATASETS_SIZES`` and ``WORK_DIR``.

    For each run and each trainset size, the dataset stored under
    ``WORK_DIR / "huggingface_<size>"`` is loaded and unpacked as keyword
    arguments into ``train_eval_loop``. The first item of its result is
    written to ``test_<size>.json`` and the second to ``dev_<size>.json``
    inside the run directory.

    Args:
        metrics_dir: Directory under which the ``run_<n>`` folders are
            created. Defaults to the global ``CAMEMBERT_METRICS_DIR``.
    """
    # Local import: no visible notebook cell brings `json` into scope.
    import json

    if metrics_dir is None:
        metrics_dir = CAMEMBERT_METRICS_DIR

    # Train & evaluate loop
    for run in range(1, N_RUNS + 1):
        RUN_OUTPUT_DIR = metrics_dir / f"run_{run}"
        RUN_OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

        logger.info(f"{model} #{run}, will save in {RUN_OUTPUT_DIR}")

        for trainset_size in INPUT_DATASETS_SIZES:
            datasetdir = WORK_DIR / f"huggingface_{trainset_size}"

            logger.info(f"Running on datasets in {datasetdir}")
            logger.info(f"Metrics will be saved in {RUN_OUTPUT_DIR}")

            # Train now!
            train_dev_test = load_from_disk(datasetdir)
            metrics = train_eval_loop(model,          # Notebook global
                                      training_args,  # Notebook global
                                      tokenizer,      # Notebook global
                                      **train_dev_test)

            # Save the metrics: metrics[0] goes to the test file and
            # metrics[1] to the dev file, named like the spaCy metrics files.
            for split_name, split_metrics in (("test", metrics[0]), ("dev", metrics[1])):
                metrics_file = RUN_OUTPUT_DIR / f"{split_name}_{trainset_size}.json"
                with open(metrics_file, "w", encoding="utf-8") as o:
                    json.dump(split_metrics, o)


                

## 23 - CamemBERT - train & eval

In [None]:
# CamemBERT: model identifier passed to init_model() and metrics output directory
CAMEMBERT_MODEL = "Jean-Baptiste/camembert-ner"
CAMEMBERT_METRICS_DIR = WORK_DIR / "22-camembert_metrics"
CAMEMBERT_METRICS_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
from camembert_util import init_model

# Build the model components from the shared training configuration.
# train_bert() reads `model`, `tokenizer` and `training_args` as globals,
# so the unpacked names must stay exactly these.
(model, tokenizer, training_args) = init_model(
    CAMEMBERT_MODEL,
    TRAINING_CONFIG,
)

# Launch the train & evaluate loop
train_bert()

## 23 - CamemBERT pretrained - train & eval

In [None]:
# CamemBERT pretrained: model identifier and metrics output directory
CAMEMBERT_PRETRAINED_MODEL = "HueyNemud/berties-pretrained-das22"
CAMEMBERT_PRETRAINED_METRICS_DIR = WORK_DIR / "23-camembert_pretrained_metrics"
CAMEMBERT_PRETRAINED_METRICS_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Same helper module as the other CamemBERT cells (camembert_util).
from camembert_util import init_model

# Get the model components for the *pretrained* checkpoint; the previous
# code passed CAMEMBERT_MODEL here and so re-ran the same model as the
# section above.
model, tokenizer, training_args = init_model(CAMEMBERT_PRETRAINED_MODEL, TRAINING_CONFIG)

# Run the main loop
# NOTE(review): called without arguments, train_bert() writes its run folders
# under CAMEMBERT_METRICS_DIR, so this run replaces the metrics files of the
# previous section and CAMEMBERT_PRETRAINED_METRICS_DIR stays unused.
# TODO: route the output to CAMEMBERT_PRETRAINED_METRICS_DIR.
train_bert()