# 40 - Experiment #2

## Initialisation

In [None]:
""" RUN THIS BLOCK ONLY ON GOOGLE COLAB """

# `GDRIVE_PAPER_FOLDER` is the relative path in your GDrive to the folder
# contaning the code of the paper
# ADAPT TO YOUR SITUATION !
%env GDRIVE_PAPER_FOLDER=TEST

# Mount Google Drive to your Colab environment. May require to log in to Google.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Copy the Python modules in `PATH_TO_SOURCES/src/ner/util` to GColab
# to enable import.
!cp -r /content/drive/MyDrive/$GDRIVE_PAPER_FOLDER/src/ner/util .

# Install dependencies
!pip install -q datasets transformers[sentencepiece] sequeval

In [2]:
""" Loads the configuration """

# Set DEBUG to 1/true to switch the logging level of nerlogger to DEBUG
# and to save the spaCy datasets as TXT alongside the .spacy files,
# for easier debugging of the training set generation.
%env DEBUG=1

# If True, activates a set of assertions in the notebooks to ensure
# that the scripts run with the parameters used in the paper.
%env AS_IN_THE_PAPER = True

# NOTE(review): the import is placed after the %env lines — presumably
# `util.config` reads DEBUG / AS_IN_THE_PAPER at import time; confirm in util/config.py.
import util.config as config

# Log the resolved configuration (directories, debug mode, seed, ...).
config.show()

24/05/2022 09:29:47 ; INFO ; BASEDIR: /content/drive/MyDrive/TEST
24/05/2022 09:29:47 ; INFO ; Input datasets will be loaded from DATASETDIR /content/drive/MyDrive/TEST/dataset
24/05/2022 09:29:47 ; INFO ; Training data and models will be saved to NERDIR /content/drive/MyDrive/TEST/src/ner
24/05/2022 09:29:47 ; INFO ; Debug mode is ON
24/05/2022 09:29:47 ; INFO ; Random seed: 42
24/05/2022 09:29:47 ; INFO ; Enable reproducibility checks: True


env: DEBUG=1
env: AS_IN_THE_PAPER=True


In [3]:
# Hyper-parameters shared by every fine-tuning run of this notebook.
# They are the same as the training configuration of 20-experiment_1.
TRAINING_CONFIG = dict(
    # Evaluation schedule
    evaluation_strategy="steps",
    eval_steps=100,
    # Optimisation
    max_steps=5000,
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=1e-5,
    # Best-model selection: the higher the "f1" metric, the better
    load_best_model_at_end=True,
    greater_is_better=True,
    metric_for_best_model="f1",
    # Checkpointing, on the same step interval as the evaluation
    save_strategy="steps",
    save_steps=100,  # Make Early callback bug ?
    save_total_limit=1,
)

# Prepared gold datasets used to fine-tune the NER layer:
# one built from the reference entries, one from the PERO-OCR output.
REF_GOLD_DATASET = config.NERDIR.joinpath("02-experiment_2_prepared_datasets/huggingface_ref")
PERO_GOLD_DATASET = config.NERDIR.joinpath("02-experiment_2_prepared_datasets/huggingface_pero")

# Stop right away if one of the prepared datasets is missing.
assert REF_GOLD_DATASET.exists()
assert PERO_GOLD_DATASET.exists()

# Display both resolved paths.
(REF_GOLD_DATASET, PERO_GOLD_DATASET)

(PosixPath('/content/drive/MyDrive/TEST/src/ner/02-experiment_2_prepared_datasets/huggingface_ref'),
 PosixPath('/content/drive/MyDrive/TEST/src/ner/02-experiment_2_prepared_datasets/huggingface_pero'))

## 41 - Fine-tuning CamemBERT-pretrained for NER on the REFERENCE gold set (manually corrected entries)

In [None]:
from datasets import load_from_disk
from util.camembert_util import init_model, train_eval_loop

# Starting point: the pretrained CamemBERT model stored under NERDIR.
MODEL_NAME = config.NERDIR / "10-camembert_pretrained_model"

# Settings of this run: the shared configuration plus a dedicated output folder.
local_config = {
    **TRAINING_CONFIG,
    "output_dir": config.NERDIR / "41-camembert_pretrained_finetuned_ref",
}

# Reference gold set; its splits are forwarded as keyword arguments below.
train_dev_test = load_from_disk(REF_GOLD_DATASET)

# Build the model, its tokenizer and the training arguments.
model, tokenizer, training_args = init_model(str(MODEL_NAME), local_config)

# Train, evaluate, and display the resulting metrics.
metrics = train_eval_loop(
    model,
    training_args,
    tokenizer,
    **train_dev_test,
    patience=5,
)
metrics

Run the following if you want to download the model from GColab. It might take a while.

In [None]:

!zip -r "/tmp/41-experiment_2_finetuned_camembert_pretrained.zip" $FINETUNED_MODEL_OUTPUT_DIR

from google.colab import files
files.download("/tmp/41-experiment_2_finetuned_camembert_pretrained.zip")

## 42 - Fine-tuning CamemBERT for NER on the REFERENCE gold set (manually corrected entries)

In [None]:
from datasets import load_from_disk
from util.camembert_util import init_model, train_eval_loop

# Starting point: the CamemBERT NER model identified by this name.
MODEL_NAME = "Jean-Baptiste/camembert-ner"

# Settings of this run: the shared configuration plus a dedicated output folder.
local_config = {
    **TRAINING_CONFIG,
    "output_dir": config.NERDIR / "42-camembert_finetuned_ref",
}

# Reference gold set; its splits are forwarded as keyword arguments below.
train_dev_test = load_from_disk(REF_GOLD_DATASET)

# Build the model, its tokenizer and the training arguments.
model, tokenizer, training_args = init_model(str(MODEL_NAME), local_config)

# Train, evaluate, and display the resulting metrics.
metrics = train_eval_loop(
    model,
    training_args,
    tokenizer,
    **train_dev_test,
    patience=5,
)
metrics

## 43 - Fine-tuning CamemBERT-pretrained for NER on the PERO-OCR gold set (noisy data)

In [None]:
from datasets import load_from_disk
from util.camembert_util import init_model, train_eval_loop

# Starting point: the pretrained CamemBERT model identified by this name.
MODEL_NAME = "HueyNemud/das22-10-camembert_pretrained"

# Settings of this run: the shared configuration plus a dedicated output folder.
local_config = {
    **TRAINING_CONFIG,
    "output_dir": config.NERDIR / "43-camembert_pretrained_finetuned_pero",
}

# PERO-OCR gold set; its splits are forwarded as keyword arguments below.
train_dev_test = load_from_disk(PERO_GOLD_DATASET)

# Build the model, its tokenizer and the training arguments.
model, tokenizer, training_args = init_model(str(MODEL_NAME), local_config)

# Train, evaluate, and display the resulting metrics.
metrics = train_eval_loop(
    model,
    training_args,
    tokenizer,
    **train_dev_test,
    patience=5,
)
metrics

## 44 - Fine-tuning CamemBERT for NER on the PERO-OCR gold set (noisy data)

In [None]:
from datasets import load_from_disk
from util.camembert_util import init_model, train_eval_loop

# Starting point: the CamemBERT NER model identified by this name.
MODEL_NAME = "Jean-Baptiste/camembert-ner"

# Settings of this run: the shared configuration plus a dedicated output folder.
local_config = {
    **TRAINING_CONFIG,
    "output_dir": config.NERDIR / "44-camembert_finetuned_pero",
}

# PERO-OCR gold set; its splits are forwarded as keyword arguments below.
train_dev_test = load_from_disk(PERO_GOLD_DATASET)

# Build the model, its tokenizer and the training arguments.
model, tokenizer, training_args = init_model(str(MODEL_NAME), local_config)

# Train, evaluate, and display the resulting metrics.
metrics = train_eval_loop(
    model,
    training_args,
    tokenizer,
    **train_dev_test,
    patience=5,
)
metrics

## 45 - Evaluate CamemBERT-pretrained and camembert (simple) on the REF, PERO and TESS test datasets

Load the model fine-tuned on the reference data from `NERDIR/41-camembert_pretrained_finetuned_ref` (the output directory of section 41).

If you prefer to use the model shared on huggingface.co, replace `model_path` with `HueyNemud/das2022-41-berties-pretrained-finetuned-ref`.


In [None]:
# Prepared gold datasets whose "test" splits are used for the evaluation.
REF_GOLD_DATASET = config.NERDIR / "02-experiment_2_prepared_datasets/huggingface_ref"
PERO_GOLD_DATASET = config.NERDIR / "02-experiment_2_prepared_datasets/huggingface_pero"
TESS_GOLD_DATASET = config.NERDIR / "02-experiment_2_prepared_datasets/huggingface_tess"

# Fail fast, before any model is loaded, if a dataset folder is missing.
# The TESS dataset in particular is not checked anywhere else in this notebook.
assert REF_GOLD_DATASET.exists(), f"Missing dataset: {REF_GOLD_DATASET}"
assert PERO_GOLD_DATASET.exists(), f"Missing dataset: {PERO_GOLD_DATASET}"
assert TESS_GOLD_DATASET.exists(), f"Missing dataset: {TESS_GOLD_DATASET}"

REF_GOLD_DATASET, PERO_GOLD_DATASET, TESS_GOLD_DATASET

In [None]:
# Folder receiving the computed metrics; created (with its parents) when absent.
METRICS_OUTPUT = config.NERDIR.joinpath("45-experiment_2_metrics")
METRICS_OUTPUT.mkdir(parents=True, exist_ok=True)
METRICS_OUTPUT

In [11]:
#Helper func to predict & save the computed metrics
import json
from datasets import load_from_disk

def eval_save(dataset_path, name):
    """Run the current `trainer` on a test split and save the metrics as JSON.

    Relies on two notebook-level globals that must exist before the call:
    `trainer` (the object whose `predict` method is used) and
    `METRICS_OUTPUT` (the folder receiving the JSON file).

    Args:
        dataset_path: Location of a dataset loadable with `load_from_disk`
            and exposing a "test" split.
        name: Suffix of the output file, which is written to
            `METRICS_OUTPUT / f"camembert_{name}.json"`.

    Returns:
        The `metrics` attribute of the object returned by `trainer.predict`,
        i.e. exactly what has been written to the JSON file.
    """
    test_split = load_from_disk(dataset_path)["test"]
    metrics = trainer.predict(test_split).metrics

    with open(METRICS_OUTPUT / f"camembert_{name}.json", "w", encoding="utf-8") as metrics_file:
        json.dump(metrics, metrics_file)

    # Returned (instead of a no-op expression) so that the caller can inspect them.
    return metrics

### Evaluate 41-camembert_pretrained_finetuned_ref

In [None]:
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification, Trainer
from util.camembert_util import compute_metrics, init_model

# Nothing is trained in this section, so a minimal configuration is enough.
dummy_config = dict(output_dir="/tmp")

# Load the model fine-tuned in section 41.
model_path = config.NERDIR / "41-camembert_pretrained_finetuned_ref"
model, tokenizer, training_args = init_model(str(model_path), dummy_config)
model.eval()  # Switch to evaluation mode

# Trainer used only for prediction; `eval_save` picks it up as the global `trainer`.
data_collator = DataCollatorForTokenClassification(tokenizer)
trainer = Trainer(
    model,
    training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Evaluate the loaded model on each test set; one JSON file is written per run.
for gold_dataset, run_name in (
    (REF_GOLD_DATASET, "pretrained_ref_test_ref"),
    (PERO_GOLD_DATASET, "pretrained_ref_test_pero"),
    (TESS_GOLD_DATASET, "pretrained_ref_test_tess"),
):
    eval_save(gold_dataset, run_name)

### Evaluate 42-camembert_finetuned_ref

In [None]:
# Load the fine-tuned model
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification, Trainer
# `init_model` is imported here as well so that this cell does not depend on
# an earlier cell having been executed.
from util.camembert_util import compute_metrics, init_model

# We won't train so keep it minimalistic
dummy_config = {"output_dir": "/tmp"}

# Model fine-tuned in section 42.
model_path = config.NERDIR / "42-camembert_finetuned_ref"
model, tokenizer, training_args = init_model(str(model_path), dummy_config)

model.eval() # Switch to evaluation mode

# Trainer used only for prediction; `eval_save` picks it up as the global `trainer`.
data_collator = DataCollatorForTokenClassification(tokenizer)
trainer = Trainer(
    model,
    training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Evaluate the loaded model on each test set; one JSON file is written per run.
for gold_dataset, run_name in (
    (REF_GOLD_DATASET, "ref_test_ref"),
    (PERO_GOLD_DATASET, "ref_test_pero"),
    (TESS_GOLD_DATASET, "ref_test_tess"),
):
    eval_save(gold_dataset, run_name)

### Evaluate 43-camembert_pretrained_finetuned_pero

In [None]:
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification, Trainer
# `init_model` is imported here as well so that this cell does not depend on
# an earlier cell having been executed.
from util.camembert_util import compute_metrics, init_model

# We won't train so keep it minimalistic
dummy_config = {"output_dir": "/tmp"}

# Model fine-tuned in section 43.
model_path = config.NERDIR / "43-camembert_pretrained_finetuned_pero"
model, tokenizer, training_args = init_model(str(model_path), dummy_config)

model.eval() # Switch to evaluation mode

# Trainer used only for prediction; `eval_save` picks it up as the global `trainer`.
data_collator = DataCollatorForTokenClassification(tokenizer)
trainer = Trainer(
    model,
    training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Evaluate the loaded model on each test set; one JSON file is written per run.
for gold_dataset, run_name in (
    (REF_GOLD_DATASET, "pretrained_pero_test_ref"),
    (PERO_GOLD_DATASET, "pretrained_pero_test_pero"),
    (TESS_GOLD_DATASET, "pretrained_pero_test_tess"),
):
    eval_save(gold_dataset, run_name)

### Evaluate 44-camembert_finetuned_pero

In [None]:
# Load the fine-tuned model
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification, Trainer
# `init_model` is imported here as well so that this cell does not depend on
# an earlier cell having been executed.
from util.camembert_util import compute_metrics, init_model

# We won't train so keep it minimalistic
dummy_config = {"output_dir": "/tmp"}

# Model fine-tuned in section 44.
model_path = config.NERDIR / "44-camembert_finetuned_pero"
model, tokenizer, training_args = init_model(str(model_path), dummy_config)

model.eval() # Switch to evaluation mode

# Trainer used only for prediction; `eval_save` picks it up as the global `trainer`.
data_collator = DataCollatorForTokenClassification(tokenizer)
trainer = Trainer(
    model,
    training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Evaluate the loaded model on each test set; one JSON file is written per run.
for gold_dataset, run_name in (
    (REF_GOLD_DATASET, "pero_test_ref"),
    (PERO_GOLD_DATASET, "pero_test_pero"),
    (TESS_GOLD_DATASET, "pero_test_tess"),
):
    eval_save(gold_dataset, run_name)